# Load the monthly panel data for Jan 1991 - Dec 2020 (360 months).
# NOTE(review): setwd() to a user-specific absolute path makes the script
# non-portable; fine for a personal analysis, flagged for reuse.
setwd('/Users/mandyhong/Desktop/MATH 422')
# Hate-crime monthly counts: all racial, anti-Black, anti-Asian.
allHC = read.csv('all_race_hc_ts.csv')
blackHC=read.csv('black_hc_ts.csv')
asianHC=read.csv('asian_hc_ts.csv')
# Covariates: unemployment rate, CPI, and average U.S. temperature.
unemp=read.csv('unemployment.csv')
cpi=read.csv('cpi.csv')
temp=read.csv('avg_temp.csv')
# Drop the first 4 rows (file header block), keep exactly 360 months, and
# coerce the contiguous-U.S. temperature column to a numeric time series.
temp=temp[-c(1:4),]
temp=temp[c(1:360),]
temp=as.numeric(temp$Contiguous.U.S.)
temp=as.ts(temp)
# Non-racially-motivated hate crimes (used as a comparison series).
non_rHC=read.csv('non_racial_hc_ts.csv')
# Transpose the wide unemployment table so each original row becomes a
# column X1..X30 -- presumably one column per year 1991-2020; the reshaping
# below relies on that layout. TODO(review): confirm against the CSV.
unemp=data.frame(t(unemp))
# Build the tidy monthly unemployment table (360 rows: Jan 1991 - Dec 2020).
#
# The transposed 'unemp' has one column per year (X1..X30 = 1991..2020);
# rows 2:13 of each column hold the Jan-Dec rates (row 1 is the header row
# left over from the transpose).
#
# This replaces the original rbind()-in-a-loop construction, which grew the
# data frames one year at a time (quadratic copying); the vectorized version
# produces the identical result. 'df' and 'df_month' keep their names
# because they are reused later when assembling cpiDF.
df <- data.frame(year = rep(1991:2020, each = 12))
df_month <- data.frame(month = rep(1:12, 30))
df_rate <- data.frame(
  unemployment = unlist(
    lapply(paste0("X", 1:30), function(col_name) unemp[[col_name]][2:13]),
    use.names = FALSE
  )
)
unempDF <- data.frame(
  year = df$year,
  month = df_month$month,
  unemployment = df_rate$unemployment
)
unempDF
## year month unemployment
## 1 1991 1 7.1
## 2 1991 2 7.3
## 3 1991 3 7.2
## 4 1991 4 6.5
## 5 1991 5 6.7
## 6 1991 6 7.0
## 7 1991 7 6.8
## 8 1991 8 6.6
## 9 1991 9 6.5
## 10 1991 10 6.5
## 11 1991 11 6.7
## 12 1991 12 6.9
## 13 1992 1 8.1
## 14 1992 2 8.2
## 15 1992 3 7.8
## 16 1992 4 7.2
## 17 1992 5 7.3
## 18 1992 6 8.0
## 19 1992 7 7.7
## 20 1992 8 7.4
## 21 1992 9 7.3
## 22 1992 10 6.9
## 23 1992 11 7.1
## 24 1992 12 7.1
## 25 1993 1 8.0
## 26 1993 2 7.8
## 27 1993 3 7.4
## 28 1993 4 6.9
## 29 1993 5 6.8
## 30 1993 6 7.2
## 31 1993 7 7.0
## 32 1993 8 6.6
## 33 1993 9 6.4
## 34 1993 10 6.4
## 35 1993 11 6.2
## 36 1993 12 6.1
## 37 1994 1 7.3
## 38 1994 2 7.1
## 39 1994 3 6.8
## 40 1994 4 6.2
## 41 1994 5 5.9
## 42 1994 6 6.2
## 43 1994 7 6.2
## 44 1994 8 5.9
## 45 1994 9 5.6
## 46 1994 10 5.4
## 47 1994 11 5.3
## 48 1994 12 5.1
## 49 1995 1 6.2
## 50 1995 2 5.9
## 51 1995 3 5.7
## 52 1995 4 5.6
## 53 1995 5 5.5
## 54 1995 6 5.8
## 55 1995 7 5.9
## 56 1995 8 5.6
## 57 1995 9 5.4
## 58 1995 10 5.2
## 59 1995 11 5.3
## 60 1995 12 5.2
## 61 1996 1 6.3
## 62 1996 2 6.0
## 63 1996 3 5.8
## 64 1996 4 5.4
## 65 1996 5 5.4
## 66 1996 6 5.5
## 67 1996 7 5.6
## 68 1996 8 5.1
## 69 1996 9 5.0
## 70 1996 10 4.9
## 71 1996 11 5.0
## 72 1996 12 5.0
## 73 1997 1 5.9
## 74 1997 2 5.7
## 75 1997 3 5.5
## 76 1997 4 4.8
## 77 1997 5 4.7
## 78 1997 6 5.2
## 79 1997 7 5.0
## 80 1997 8 4.8
## 81 1997 9 4.7
## 82 1997 10 4.4
## 83 1997 11 4.3
## 84 1997 12 4.4
## 85 1998 1 5.2
## 86 1998 2 5.0
## 87 1998 3 5.0
## 88 1998 4 4.1
## 89 1998 5 4.2
## 90 1998 6 4.7
## 91 1998 7 4.7
## 92 1998 8 4.5
## 93 1998 9 4.4
## 94 1998 10 4.2
## 95 1998 11 4.1
## 96 1998 12 4.0
## 97 1999 1 4.8
## 98 1999 2 4.7
## 99 1999 3 4.4
## 100 1999 4 4.1
## 101 1999 5 4.0
## 102 1999 6 4.5
## 103 1999 7 4.5
## 104 1999 8 4.2
## 105 1999 9 4.1
## 106 1999 10 3.8
## 107 1999 11 3.8
## 108 1999 12 3.7
## 109 2000 1 4.5
## 110 2000 2 4.4
## 111 2000 3 4.3
## 112 2000 4 3.7
## 113 2000 5 3.8
## 114 2000 6 4.1
## 115 2000 7 4.2
## 116 2000 8 4.1
## 117 2000 9 3.8
## 118 2000 10 3.6
## 119 2000 11 3.7
## 120 2000 12 3.7
## 121 2001 1 4.7
## 122 2001 2 4.6
## 123 2001 3 4.5
## 124 2001 4 4.2
## 125 2001 5 4.1
## 126 2001 6 4.7
## 127 2001 7 4.7
## 128 2001 8 4.9
## 129 2001 9 4.7
## 130 2001 10 5.0
## 131 2001 11 5.3
## 132 2001 12 5.4
## 133 2002 1 6.3
## 134 2002 2 6.1
## 135 2002 3 6.1
## 136 2002 4 5.7
## 137 2002 5 5.5
## 138 2002 6 6.0
## 139 2002 7 5.9
## 140 2002 8 5.7
## 141 2002 9 5.4
## 142 2002 10 5.3
## 143 2002 11 5.6
## 144 2002 12 5.7
## 145 2003 1 6.5
## 146 2003 2 6.4
## 147 2003 3 6.2
## 148 2003 4 5.8
## 149 2003 5 5.8
## 150 2003 6 6.5
## 151 2003 7 6.3
## 152 2003 8 6.0
## 153 2003 9 5.8
## 154 2003 10 5.6
## 155 2003 11 5.6
## 156 2003 12 5.4
## 157 2004 1 6.3
## 158 2004 2 6.0
## 159 2004 3 6.0
## 160 2004 4 5.4
## 161 2004 5 5.3
## 162 2004 6 5.8
## 163 2004 7 5.7
## 164 2004 8 5.4
## 165 2004 9 5.1
## 166 2004 10 5.1
## 167 2004 11 5.2
## 168 2004 12 5.1
## 169 2005 1 5.7
## 170 2005 2 5.8
## 171 2005 3 5.4
## 172 2005 4 4.9
## 173 2005 5 4.9
## 174 2005 6 5.2
## 175 2005 7 5.2
## 176 2005 8 4.9
## 177 2005 9 4.8
## 178 2005 10 4.6
## 179 2005 11 4.8
## 180 2005 12 4.6
## 181 2006 1 5.1
## 182 2006 2 5.1
## 183 2006 3 4.8
## 184 2006 4 4.5
## 185 2006 5 4.4
## 186 2006 6 4.8
## 187 2006 7 5.0
## 188 2006 8 4.6
## 189 2006 9 4.4
## 190 2006 10 4.1
## 191 2006 11 4.3
## 192 2006 12 4.3
## 193 2007 1 5.0
## 194 2007 2 4.9
## 195 2007 3 4.5
## 196 2007 4 4.3
## 197 2007 5 4.3
## 198 2007 6 4.7
## 199 2007 7 4.9
## 200 2007 8 4.6
## 201 2007 9 4.5
## 202 2007 10 4.4
## 203 2007 11 4.5
## 204 2007 12 4.8
## 205 2008 1 5.4
## 206 2008 2 5.2
## 207 2008 3 5.2
## 208 2008 4 4.8
## 209 2008 5 5.2
## 210 2008 6 5.7
## 211 2008 7 6.0
## 212 2008 8 6.1
## 213 2008 9 6.0
## 214 2008 10 6.1
## 215 2008 11 6.5
## 216 2008 12 7.1
## 217 2009 1 8.5
## 218 2009 2 8.9
## 219 2009 3 9.0
## 220 2009 4 8.6
## 221 2009 5 9.1
## 222 2009 6 9.7
## 223 2009 7 9.7
## 224 2009 8 9.6
## 225 2009 9 9.5
## 226 2009 10 9.5
## 227 2009 11 9.4
## 228 2009 12 9.7
## 229 2010 1 10.6
## 230 2010 2 10.4
## 231 2010 3 10.2
## 232 2010 4 9.5
## 233 2010 5 9.3
## 234 2010 6 9.6
## 235 2010 7 9.7
## 236 2010 8 9.5
## 237 2010 9 9.2
## 238 2010 10 9.0
## 239 2010 11 9.3
## 240 2010 12 9.1
## 241 2011 1 9.8
## 242 2011 2 9.5
## 243 2011 3 9.2
## 244 2011 4 8.7
## 245 2011 5 8.7
## 246 2011 6 9.3
## 247 2011 7 9.3
## 248 2011 8 9.1
## 249 2011 9 8.8
## 250 2011 10 8.5
## 251 2011 11 8.2
## 252 2011 12 8.3
## 253 2012 1 8.8
## 254 2012 2 8.7
## 255 2012 3 8.4
## 256 2012 4 7.7
## 257 2012 5 7.9
## 258 2012 6 8.4
## 259 2012 7 8.6
## 260 2012 8 8.2
## 261 2012 9 7.6
## 262 2012 10 7.5
## 263 2012 11 7.4
## 264 2012 12 7.6
## 265 2013 1 8.5
## 266 2013 2 8.1
## 267 2013 3 7.6
## 268 2013 4 7.1
## 269 2013 5 7.3
## 270 2013 6 7.8
## 271 2013 7 7.7
## 272 2013 8 7.3
## 273 2013 9 7.0
## 274 2013 10 7.0
## 275 2013 11 6.6
## 276 2013 12 6.5
## 277 2014 1 7.0
## 278 2014 2 7.0
## 279 2014 3 6.8
## 280 2014 4 5.9
## 281 2014 5 6.1
## 282 2014 6 6.3
## 283 2014 7 6.5
## 284 2014 8 6.3
## 285 2014 9 5.7
## 286 2014 10 5.5
## 287 2014 11 5.5
## 288 2014 12 5.4
## 289 2015 1 6.1
## 290 2015 2 5.8
## 291 2015 3 5.6
## 292 2015 4 5.1
## 293 2015 5 5.3
## 294 2015 6 5.5
## 295 2015 7 5.6
## 296 2015 8 5.2
## 297 2015 9 4.9
## 298 2015 10 4.8
## 299 2015 11 4.8
## 300 2015 12 4.8
## 301 2016 1 5.3
## 302 2016 2 5.2
## 303 2016 3 5.1
## 304 2016 4 4.7
## 305 2016 5 4.5
## 306 2016 6 5.1
## 307 2016 7 5.1
## 308 2016 8 5.0
## 309 2016 9 4.8
## 310 2016 10 4.7
## 311 2016 11 4.4
## 312 2016 12 4.5
## 313 2017 1 5.1
## 314 2017 2 4.9
## 315 2017 3 4.6
## 316 2017 4 4.1
## 317 2017 5 4.1
## 318 2017 6 4.5
## 319 2017 7 4.6
## 320 2017 8 4.5
## 321 2017 9 4.1
## 322 2017 10 3.9
## 323 2017 11 3.9
## 324 2017 12 3.9
## 325 2018 1 4.5
## 326 2018 2 4.4
## 327 2018 3 4.1
## 328 2018 4 3.7
## 329 2018 5 3.6
## 330 2018 6 4.2
## 331 2018 7 4.1
## 332 2018 8 3.9
## 333 2018 9 3.6
## 334 2018 10 3.5
## 335 2018 11 3.5
## 336 2018 12 3.7
## 337 2019 1 4.4
## 338 2019 2 4.1
## 339 2019 3 3.9
## 340 2019 4 3.3
## 341 2019 5 3.4
## 342 2019 6 3.8
## 343 2019 7 4.0
## 344 2019 8 3.8
## 345 2019 9 3.3
## 346 2019 10 3.3
## 347 2019 11 3.3
## 348 2019 12 3.4
## 349 2020 1 4.0
## 350 2020 2 3.8
## 351 2020 3 4.5
## 352 2020 4 14.4
## 353 2020 5 13.0
## 354 2020 6 11.2
## 355 2020 7 10.5
## 356 2020 8 8.5
## 357 2020 9 7.7
## 358 2020 10 6.6
## 359 2020 11 6.4
## 360 2020 12 6.5
# Inspect the CPI table: 382 monthly rows (extends past Dec 2020, so it is
# trimmed to the first 360 rows below); the series lives in column 'Value'.
str(cpi)
## 'data.frame': 382 obs. of 5 variables:
## $ Series.ID: chr "CUUR0000SA0" "CUUR0000SA0" "CUUR0000SA0" "CUUR0000SA0" ...
## $ Year : int 1991 1991 1991 1991 1991 1991 1991 1991 1991 1991 ...
## $ Period : chr "M01" "M02" "M03" "M04" ...
## $ Label : chr "1991 Jan" "1991 Feb" "1991 Mar" "1991 Apr" ...
## $ Value : num 135 135 135 135 136 ...
# Attach the CPI values to the shared year/month index, keeping only the
# first 360 observations (Jan 1991 - Dec 2020) to match the other series.
cpiDF <- data.frame(
  year = df$year,
  month = df_month$month,
  cpi = cpi[["Value"]][1:360]
)
cpiDF
## year month cpi
## 1 1991 1 134.600
## 2 1991 2 134.800
## 3 1991 3 135.000
## 4 1991 4 135.200
## 5 1991 5 135.600
## 6 1991 6 136.000
## 7 1991 7 136.200
## 8 1991 8 136.600
## 9 1991 9 137.200
## 10 1991 10 137.400
## 11 1991 11 137.800
## 12 1991 12 137.900
## 13 1992 1 138.100
## 14 1992 2 138.600
## 15 1992 3 139.300
## 16 1992 4 139.500
## 17 1992 5 139.700
## 18 1992 6 140.200
## 19 1992 7 140.500
## 20 1992 8 140.900
## 21 1992 9 141.300
## 22 1992 10 141.800
## 23 1992 11 142.000
## 24 1992 12 141.900
## 25 1993 1 142.600
## 26 1993 2 143.100
## 27 1993 3 143.600
## 28 1993 4 144.000
## 29 1993 5 144.200
## 30 1993 6 144.400
## 31 1993 7 144.400
## 32 1993 8 144.800
## 33 1993 9 145.100
## 34 1993 10 145.700
## 35 1993 11 145.800
## 36 1993 12 145.800
## 37 1994 1 146.200
## 38 1994 2 146.700
## 39 1994 3 147.200
## 40 1994 4 147.400
## 41 1994 5 147.500
## 42 1994 6 148.000
## 43 1994 7 148.400
## 44 1994 8 149.000
## 45 1994 9 149.400
## 46 1994 10 149.500
## 47 1994 11 149.700
## 48 1994 12 149.700
## 49 1995 1 150.300
## 50 1995 2 150.900
## 51 1995 3 151.400
## 52 1995 4 151.900
## 53 1995 5 152.200
## 54 1995 6 152.500
## 55 1995 7 152.500
## 56 1995 8 152.900
## 57 1995 9 153.200
## 58 1995 10 153.700
## 59 1995 11 153.600
## 60 1995 12 153.500
## 61 1996 1 154.400
## 62 1996 2 154.900
## 63 1996 3 155.700
## 64 1996 4 156.300
## 65 1996 5 156.600
## 66 1996 6 156.700
## 67 1996 7 157.000
## 68 1996 8 157.300
## 69 1996 9 157.800
## 70 1996 10 158.300
## 71 1996 11 158.600
## 72 1996 12 158.600
## 73 1997 1 159.100
## 74 1997 2 159.600
## 75 1997 3 160.000
## 76 1997 4 160.200
## 77 1997 5 160.100
## 78 1997 6 160.300
## 79 1997 7 160.500
## 80 1997 8 160.800
## 81 1997 9 161.200
## 82 1997 10 161.600
## 83 1997 11 161.500
## 84 1997 12 161.300
## 85 1998 1 161.600
## 86 1998 2 161.900
## 87 1998 3 162.200
## 88 1998 4 162.500
## 89 1998 5 162.800
## 90 1998 6 163.000
## 91 1998 7 163.200
## 92 1998 8 163.400
## 93 1998 9 163.600
## 94 1998 10 164.000
## 95 1998 11 164.000
## 96 1998 12 163.900
## 97 1999 1 164.300
## 98 1999 2 164.500
## 99 1999 3 165.000
## 100 1999 4 166.200
## 101 1999 5 166.200
## 102 1999 6 166.200
## 103 1999 7 166.700
## 104 1999 8 167.100
## 105 1999 9 167.900
## 106 1999 10 168.200
## 107 1999 11 168.300
## 108 1999 12 168.300
## 109 2000 1 168.800
## 110 2000 2 169.800
## 111 2000 3 171.200
## 112 2000 4 171.300
## 113 2000 5 171.500
## 114 2000 6 172.400
## 115 2000 7 172.800
## 116 2000 8 172.800
## 117 2000 9 173.700
## 118 2000 10 174.000
## 119 2000 11 174.100
## 120 2000 12 174.000
## 121 2001 1 175.100
## 122 2001 2 175.800
## 123 2001 3 176.200
## 124 2001 4 176.900
## 125 2001 5 177.700
## 126 2001 6 178.000
## 127 2001 7 177.500
## 128 2001 8 177.500
## 129 2001 9 178.300
## 130 2001 10 177.700
## 131 2001 11 177.400
## 132 2001 12 176.700
## 133 2002 1 177.100
## 134 2002 2 177.800
## 135 2002 3 178.800
## 136 2002 4 179.800
## 137 2002 5 179.800
## 138 2002 6 179.900
## 139 2002 7 180.100
## 140 2002 8 180.700
## 141 2002 9 181.000
## 142 2002 10 181.300
## 143 2002 11 181.300
## 144 2002 12 180.900
## 145 2003 1 181.700
## 146 2003 2 183.100
## 147 2003 3 184.200
## 148 2003 4 183.800
## 149 2003 5 183.500
## 150 2003 6 183.700
## 151 2003 7 183.900
## 152 2003 8 184.600
## 153 2003 9 185.200
## 154 2003 10 185.000
## 155 2003 11 184.500
## 156 2003 12 184.300
## 157 2004 1 185.200
## 158 2004 2 186.200
## 159 2004 3 187.400
## 160 2004 4 188.000
## 161 2004 5 189.100
## 162 2004 6 189.700
## 163 2004 7 189.400
## 164 2004 8 189.500
## 165 2004 9 189.900
## 166 2004 10 190.900
## 167 2004 11 191.000
## 168 2004 12 190.300
## 169 2005 1 190.700
## 170 2005 2 191.800
## 171 2005 3 193.300
## 172 2005 4 194.600
## 173 2005 5 194.400
## 174 2005 6 194.500
## 175 2005 7 195.400
## 176 2005 8 196.400
## 177 2005 9 198.800
## 178 2005 10 199.200
## 179 2005 11 197.600
## 180 2005 12 196.800
## 181 2006 1 198.300
## 182 2006 2 198.700
## 183 2006 3 199.800
## 184 2006 4 201.500
## 185 2006 5 202.500
## 186 2006 6 202.900
## 187 2006 7 203.500
## 188 2006 8 203.900
## 189 2006 9 202.900
## 190 2006 10 201.800
## 191 2006 11 201.500
## 192 2006 12 201.800
## 193 2007 1 202.416
## 194 2007 2 203.499
## 195 2007 3 205.352
## 196 2007 4 206.686
## 197 2007 5 207.949
## 198 2007 6 208.352
## 199 2007 7 208.299
## 200 2007 8 207.917
## 201 2007 9 208.490
## 202 2007 10 208.936
## 203 2007 11 210.177
## 204 2007 12 210.036
## 205 2008 1 211.080
## 206 2008 2 211.693
## 207 2008 3 213.528
## 208 2008 4 214.823
## 209 2008 5 216.632
## 210 2008 6 218.815
## 211 2008 7 219.964
## 212 2008 8 219.086
## 213 2008 9 218.783
## 214 2008 10 216.573
## 215 2008 11 212.425
## 216 2008 12 210.228
## 217 2009 1 211.143
## 218 2009 2 212.193
## 219 2009 3 212.709
## 220 2009 4 213.240
## 221 2009 5 213.856
## 222 2009 6 215.693
## 223 2009 7 215.351
## 224 2009 8 215.834
## 225 2009 9 215.969
## 226 2009 10 216.177
## 227 2009 11 216.330
## 228 2009 12 215.949
## 229 2010 1 216.687
## 230 2010 2 216.741
## 231 2010 3 217.631
## 232 2010 4 218.009
## 233 2010 5 218.178
## 234 2010 6 217.965
## 235 2010 7 218.011
## 236 2010 8 218.312
## 237 2010 9 218.439
## 238 2010 10 218.711
## 239 2010 11 218.803
## 240 2010 12 219.179
## 241 2011 1 220.223
## 242 2011 2 221.309
## 243 2011 3 223.467
## 244 2011 4 224.906
## 245 2011 5 225.964
## 246 2011 6 225.722
## 247 2011 7 225.922
## 248 2011 8 226.545
## 249 2011 9 226.889
## 250 2011 10 226.421
## 251 2011 11 226.230
## 252 2011 12 225.672
## 253 2012 1 226.665
## 254 2012 2 227.663
## 255 2012 3 229.392
## 256 2012 4 230.085
## 257 2012 5 229.815
## 258 2012 6 229.478
## 259 2012 7 229.104
## 260 2012 8 230.379
## 261 2012 9 231.407
## 262 2012 10 231.317
## 263 2012 11 230.221
## 264 2012 12 229.601
## 265 2013 1 230.280
## 266 2013 2 232.166
## 267 2013 3 232.773
## 268 2013 4 232.531
## 269 2013 5 232.945
## 270 2013 6 233.504
## 271 2013 7 233.596
## 272 2013 8 233.877
## 273 2013 9 234.149
## 274 2013 10 233.546
## 275 2013 11 233.069
## 276 2013 12 233.049
## 277 2014 1 233.916
## 278 2014 2 234.781
## 279 2014 3 236.293
## 280 2014 4 237.072
## 281 2014 5 237.900
## 282 2014 6 238.343
## 283 2014 7 238.250
## 284 2014 8 237.852
## 285 2014 9 238.031
## 286 2014 10 237.433
## 287 2014 11 236.151
## 288 2014 12 234.812
## 289 2015 1 233.707
## 290 2015 2 234.722
## 291 2015 3 236.119
## 292 2015 4 236.599
## 293 2015 5 237.805
## 294 2015 6 238.638
## 295 2015 7 238.654
## 296 2015 8 238.316
## 297 2015 9 237.945
## 298 2015 10 237.838
## 299 2015 11 237.336
## 300 2015 12 236.525
## 301 2016 1 236.916
## 302 2016 2 237.111
## 303 2016 3 238.132
## 304 2016 4 239.261
## 305 2016 5 240.229
## 306 2016 6 241.018
## 307 2016 7 240.628
## 308 2016 8 240.849
## 309 2016 9 241.428
## 310 2016 10 241.729
## 311 2016 11 241.353
## 312 2016 12 241.432
## 313 2017 1 242.839
## 314 2017 2 243.603
## 315 2017 3 243.801
## 316 2017 4 244.524
## 317 2017 5 244.733
## 318 2017 6 244.955
## 319 2017 7 244.786
## 320 2017 8 245.519
## 321 2017 9 246.819
## 322 2017 10 246.663
## 323 2017 11 246.669
## 324 2017 12 246.524
## 325 2018 1 247.867
## 326 2018 2 248.991
## 327 2018 3 249.554
## 328 2018 4 250.546
## 329 2018 5 251.588
## 330 2018 6 251.989
## 331 2018 7 252.006
## 332 2018 8 252.146
## 333 2018 9 252.439
## 334 2018 10 252.885
## 335 2018 11 252.038
## 336 2018 12 251.233
## 337 2019 1 251.712
## 338 2019 2 252.776
## 339 2019 3 254.202
## 340 2019 4 255.548
## 341 2019 5 256.092
## 342 2019 6 256.143
## 343 2019 7 256.571
## 344 2019 8 256.558
## 345 2019 9 256.759
## 346 2019 10 257.346
## 347 2019 11 257.208
## 348 2019 12 256.974
## 349 2020 1 257.971
## 350 2020 2 258.678
## 351 2020 3 258.115
## 352 2020 4 256.389
## 353 2020 5 256.394
## 354 2020 6 257.797
## 355 2020 7 259.101
## 356 2020 8 259.918
## 357 2020 9 260.280
## 358 2020 10 260.388
## 359 2020 11 260.229
## 360 2020 12 260.474
# Non-racial hate crimes: same 360-month panel layout (year, month, count).
str(non_rHC)
## 'data.frame': 360 obs. of 3 variables:
## $ year : int 1991 1991 1991 1991 1991 1991 1991 1991 1991 1991 ...
## $ month : int 1 2 3 4 5 6 7 8 9 10 ...
## $ hate_crime: int 102 81 69 76 90 100 111 152 144 150 ...
# Print the full non-racial series for the report.
non_rHC
## year month hate_crime
## 1 1991 1 102
## 2 1991 2 81
## 3 1991 3 69
## 4 1991 4 76
## 5 1991 5 90
## 6 1991 6 100
## 7 1991 7 111
## 8 1991 8 152
## 9 1991 9 144
## 10 1991 10 150
## 11 1991 11 146
## 12 1991 12 97
## 13 1992 1 123
## 14 1992 2 149
## 15 1992 3 202
## 16 1992 4 199
## 17 1992 5 178
## 18 1992 6 155
## 19 1992 7 127
## 20 1992 8 173
## 21 1992 9 113
## 22 1992 10 179
## 23 1992 11 200
## 24 1992 12 140
## 25 1993 1 162
## 26 1993 2 176
## 27 1993 3 140
## 28 1993 4 198
## 29 1993 5 180
## 30 1993 6 196
## 31 1993 7 185
## 32 1993 8 187
## 33 1993 9 177
## 34 1993 10 206
## 35 1993 11 205
## 36 1993 12 144
## 37 1994 1 108
## 38 1994 2 153
## 39 1994 3 218
## 40 1994 4 180
## 41 1994 5 115
## 42 1994 6 140
## 43 1994 7 133
## 44 1994 8 150
## 45 1994 9 147
## 46 1994 10 163
## 47 1994 11 121
## 48 1994 12 126
## 49 1995 1 202
## 50 1995 2 162
## 51 1995 3 183
## 52 1995 4 235
## 53 1995 5 164
## 54 1995 6 204
## 55 1995 7 217
## 56 1995 8 171
## 57 1995 9 196
## 58 1995 10 216
## 59 1995 11 184
## 60 1995 12 162
## 61 1996 1 174
## 62 1996 2 185
## 63 1996 3 200
## 64 1996 4 245
## 65 1996 5 207
## 66 1996 6 202
## 67 1996 7 204
## 68 1996 8 210
## 69 1996 9 215
## 70 1996 10 234
## 71 1996 11 183
## 72 1996 12 186
## 73 1997 1 151
## 74 1997 2 198
## 75 1997 3 241
## 76 1997 4 220
## 77 1997 5 238
## 78 1997 6 225
## 79 1997 7 224
## 80 1997 8 185
## 81 1997 9 223
## 82 1997 10 247
## 83 1997 11 186
## 84 1997 12 174
## 85 1998 1 218
## 86 1998 2 206
## 87 1998 3 207
## 88 1998 4 244
## 89 1998 5 243
## 90 1998 6 210
## 91 1998 7 216
## 92 1998 8 232
## 93 1998 9 212
## 94 1998 10 312
## 95 1998 11 212
## 96 1998 12 207
## 97 1999 1 193
## 98 1999 2 233
## 99 1999 3 219
## 100 1999 4 262
## 101 1999 5 235
## 102 1999 6 201
## 103 1999 7 256
## 104 1999 8 262
## 105 1999 9 264
## 106 1999 10 227
## 107 1999 11 210
## 108 1999 12 197
## 109 2000 1 193
## 110 2000 2 216
## 111 2000 3 248
## 112 2000 4 250
## 113 2000 5 226
## 114 2000 6 235
## 115 2000 7 220
## 116 2000 8 212
## 117 2000 9 243
## 118 2000 10 398
## 119 2000 11 234
## 120 2000 12 186
## 121 2001 1 216
## 122 2001 2 195
## 123 2001 3 222
## 124 2001 4 273
## 125 2001 5 244
## 126 2001 6 266
## 127 2001 7 222
## 128 2001 8 255
## 129 2001 9 636
## 130 2001 10 366
## 131 2001 11 210
## 132 2001 12 193
## 133 2002 1 179
## 134 2002 2 186
## 135 2002 3 263
## 136 2002 4 290
## 137 2002 5 237
## 138 2002 6 248
## 139 2002 7 241
## 140 2002 8 186
## 141 2002 9 297
## 142 2002 10 230
## 143 2002 11 201
## 144 2002 12 162
## 145 2003 1 182
## 146 2003 2 194
## 147 2003 3 227
## 148 2003 4 238
## 149 2003 5 255
## 150 2003 6 218
## 151 2003 7 216
## 152 2003 8 206
## 153 2003 9 244
## 154 2003 10 258
## 155 2003 11 232
## 156 2003 12 168
## 157 2004 1 195
## 158 2004 2 229
## 159 2004 3 213
## 160 2004 4 264
## 161 2004 5 231
## 162 2004 6 236
## 163 2004 7 208
## 164 2004 8 208
## 165 2004 9 240
## 166 2004 10 232
## 167 2004 11 199
## 168 2004 12 173
## 169 2005 1 177
## 170 2005 2 189
## 171 2005 3 229
## 172 2005 4 250
## 173 2005 5 256
## 174 2005 6 202
## 175 2005 7 197
## 176 2005 8 199
## 177 2005 9 202
## 178 2005 10 230
## 179 2005 11 183
## 180 2005 12 158
## 181 2006 1 194
## 182 2006 2 187
## 183 2006 3 232
## 184 2006 4 241
## 185 2006 5 220
## 186 2006 6 247
## 187 2006 7 273
## 188 2006 8 243
## 189 2006 9 245
## 190 2006 10 241
## 191 2006 11 215
## 192 2006 12 202
## 193 2007 1 200
## 194 2007 2 162
## 195 2007 3 243
## 196 2007 4 252
## 197 2007 5 251
## 198 2007 6 219
## 199 2007 7 219
## 200 2007 8 243
## 201 2007 9 287
## 202 2007 10 290
## 203 2007 11 199
## 204 2007 12 167
## 205 2008 1 209
## 206 2008 2 229
## 207 2008 3 270
## 208 2008 4 250
## 209 2008 5 272
## 210 2008 6 262
## 211 2008 7 251
## 212 2008 8 257
## 213 2008 9 242
## 214 2008 10 277
## 215 2008 11 275
## 216 2008 12 187
## 217 2009 1 196
## 218 2009 2 188
## 219 2009 3 223
## 220 2009 4 232
## 221 2009 5 247
## 222 2009 6 220
## 223 2009 7 204
## 224 2009 8 243
## 225 2009 9 215
## 226 2009 10 250
## 227 2009 11 225
## 228 2009 12 176
## 229 2010 1 195
## 230 2010 2 179
## 231 2010 3 229
## 232 2010 4 263
## 233 2010 5 248
## 234 2010 6 202
## 235 2010 7 193
## 236 2010 8 251
## 237 2010 9 270
## 238 2010 10 263
## 239 2010 11 201
## 240 2010 12 151
## 241 2011 1 168
## 242 2011 2 148
## 243 2011 3 200
## 244 2011 4 225
## 245 2011 5 242
## 246 2011 6 235
## 247 2011 7 233
## 248 2011 8 235
## 249 2011 9 220
## 250 2011 10 256
## 251 2011 11 200
## 252 2011 12 243
## 253 2012 1 266
## 254 2012 2 201
## 255 2012 3 234
## 256 2012 4 240
## 257 2012 5 233
## 258 2012 6 239
## 259 2012 7 259
## 260 2012 8 223
## 261 2012 9 253
## 262 2012 10 210
## 263 2012 11 218
## 264 2012 12 155
## 265 2013 1 164
## 266 2013 2 158
## 267 2013 3 209
## 268 2013 4 217
## 269 2013 5 235
## 270 2013 6 246
## 271 2013 7 218
## 272 2013 8 220
## 273 2013 9 204
## 274 2013 10 223
## 275 2013 11 186
## 276 2013 12 179
## 277 2014 1 120
## 278 2014 2 151
## 279 2014 3 202
## 280 2014 4 206
## 281 2014 5 201
## 282 2014 6 227
## 283 2014 7 215
## 284 2014 8 240
## 285 2014 9 212
## 286 2014 10 240
## 287 2014 11 153
## 288 2014 12 139
## 289 2015 1 180
## 290 2015 2 153
## 291 2015 3 183
## 292 2015 4 228
## 293 2015 5 231
## 294 2015 6 240
## 295 2015 7 254
## 296 2015 8 223
## 297 2015 9 189
## 298 2015 10 214
## 299 2015 11 195
## 300 2015 12 246
## 301 2016 1 163
## 302 2016 2 170
## 303 2016 3 222
## 304 2016 4 228
## 305 2016 5 220
## 306 2016 6 255
## 307 2016 7 243
## 308 2016 8 197
## 309 2016 9 195
## 310 2016 10 228
## 311 2016 11 322
## 312 2016 12 226
## 313 2017 1 278
## 314 2017 2 267
## 315 2017 3 303
## 316 2017 4 233
## 317 2017 5 272
## 318 2017 6 259
## 319 2017 7 258
## 320 2017 8 271
## 321 2017 9 274
## 322 2017 10 256
## 323 2017 11 227
## 324 2017 12 195
## 325 2018 1 213
## 326 2018 2 245
## 327 2018 3 235
## 328 2018 4 212
## 329 2018 5 287
## 330 2018 6 274
## 331 2018 7 230
## 332 2018 8 265
## 333 2018 9 266
## 334 2018 10 315
## 335 2018 11 266
## 336 2018 12 245
## 337 2019 1 224
## 338 2019 2 187
## 339 2019 3 294
## 340 2019 4 264
## 341 2019 5 312
## 342 2019 6 329
## 343 2019 7 303
## 344 2019 8 287
## 345 2019 9 303
## 346 2019 10 314
## 347 2019 11 244
## 348 2019 12 282
## 349 2020 1 310
## 350 2020 2 295
## 351 2020 3 245
## 352 2020 4 196
## 353 2020 5 247
## 354 2020 6 301
## 355 2020 7 313
## 356 2020 8 350
## 357 2020 9 301
## 358 2020 10 352
## 359 2020 11 291
## 360 2020 12 269
### Get time series plots for all three data and summary statistics
# Each xts series is indexed by yearmon = year + (month - 1)/12.
#ts1 <- xts(unempDF$unemploymente, as.POSIXct(sprintf("%d-%d-01", unempDF$year, unempDF$month)))
ts1 <- xts(unempDF$unemployment, as.yearmon(unempDF$year + (unempDF$month-1)/12))
plot(ts1, main="unemployment")
ts2 <- xts(cpiDF$cpi, as.yearmon(cpiDF$year + (cpiDF$month-1)/12))
plot(ts2, main="cpi")
# Temperature has no year/month columns of its own, so borrow the asianHC
# index (same 360-month span).
ts3 <- xts(temp, as.yearmon(asianHC$year + (asianHC$month-1)/12))
plot(ts3, main="average temperature")
ts4 <- xts(allHC$hate_crime, as.yearmon(allHC$year + (allHC$month-1)/12))
plot(ts4, main="all racial hate crime")
ts5 <- xts(blackHC$hate_crime, as.yearmon(blackHC$year + (blackHC$month-1)/12))
plot(ts5, main="anti-black hate crime")
ts6 <- xts(asianHC$hate_crime, as.yearmon(asianHC$year + (asianHC$month-1)/12))
# FIX: ts6 plots the anti-Asian series; the title previously read
# "anti-black hate crime" (copy-paste from the ts5 plot above).
plot(ts6, main="anti-asian hate crime")
# First differences of unemployment, to eyeball stationarity.
tsplot(diff(unempDF$unemployment))
# Fit a linear time trend to CPI over the 360 months.
t=1:360
fit=lm(cpi ~ t, data=cpiDF)
summary(fit)
##
## Call:
## lm(formula = cpi ~ t, data = cpiDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.662 -2.094 0.005 1.262 11.118
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.326e+02 2.721e-01 487.2 <2e-16 ***
## t 3.615e-01 1.306e-03 276.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.576 on 358 degrees of freedom
## Multiple R-squared: 0.9953, Adjusted R-squared: 0.9953
## F-statistic: 7.658e+04 on 1 and 358 DF, p-value: < 2.2e-16
# Residuals from the linear trend fit = detrended CPI.
detrend_cpi=fit$residuals
tsplot(detrend_cpi)
acf(detrend_cpi)
# Difference the detrended CPI to check whether one more difference is
# needed for stationarity.
tsplot(diff(detrend_cpi))
acf(diff(detrend_cpi))
# Same stationarity diagnostics for unemployment (levels vs differences).
acf(unempDF$unemployment)
tsplot(diff(unempDF$unemployment))
acf(diff(unempDF$unemployment))
# And for the all-racial hate crime counts and raw CPI.
tsplot(allHC$hate_crime)
acf(allHC$hate_crime)
tsplot(cpiDF$cpi)
#detrend cpi
# NOTE(review): the three lines below re-fit exactly the same CPI trend
# model already fitted above (t = 1:360, lm(cpi ~ t)); the duplication is
# harmless but redundant in the rendered report.
t=1:360
fit=lm(cpi ~ t, data=cpiDF)
summary(fit)
##
## Call:
## lm(formula = cpi ~ t, data = cpiDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.662 -2.094 0.005 1.262 11.118
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.326e+02 2.721e-01 487.2 <2e-16 ***
## t 3.615e-01 1.306e-03 276.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.576 on 358 degrees of freedom
## Multiple R-squared: 0.9953, Adjusted R-squared: 0.9953
## F-statistic: 7.658e+04 on 1 and 358 DF, p-value: < 2.2e-16
# Recompute the detrended CPI diagnostics (repeat of the earlier section).
detrend_cpi=fit$residuals
tsplot(detrend_cpi)
acf(detrend_cpi)
tsplot(diff(detrend_cpi))
acf(diff(detrend_cpi))
# Non-racial hate crime: level plot and ACF.
tsplot(non_rHC$hate_crime)
acf(non_rHC$hate_crime)
# Align every series on a common time index and bundle them into one zoo
# object; 'time' is an explicit 1..360 linear-trend column.
new_merged=as.zoo(ts.intersect(cpi=ts(cpiDF$cpi),cpi_notrend=detrend_cpi, unemp=ts(unempDF$unemployment), temp, non_rhc= ts(non_rHC$hate_crime), hc=ts(allHC$hate_crime), blackhc=ts(blackHC$hate_crime), asianhc=ts(asianHC$hate_crime), time=ts(1:360)))
# Log-log scatterplots of hate crime against CPI and unemployment.
plot(log(cpiDF$cpi), log(allHC$hate_crime))
plot(log(unempDF$unemployment), log(allHC$hate_crime))
# Rows 229:360 are Jan 2010 - Dec 2020 (recent-decade subsample).
# NOTE(review): stray trailing comma in the call below -- confirm it was
# not meant to pass an extra graphical parameter.
plot(cpiDF$cpi[229:360], blackHC$hate_crime[229:360], )
plot(blackHC$hate_crime[229:360], unempDF$unemployment[229:360])
plot(asianHC$hate_crime[229:360], cpiDF$cpi[229:360])
plot(asianHC$hate_crime[229:360], unempDF$unemployment[229:360])
# Anti-Asian hate crime vs CPI, 2010-2020, with an OLS fit line.
y=asianHC$hate_crime[229:360]
x=cpiDF$cpi[229:360]
plot(x,y)
abline(lm(y ~ x), col = "orange", lwd = 3)
# Panel function for pairs(): writes the rounded pairwise correlation in the
# lower-triangle cell instead of drawing a scatterplot.
panel.cor <- function(x, y, ...) {
  # Remember the panel's coordinate system and restore it when we leave.
  saved_usr <- par("usr")
  on.exit(par(saved_usr))
  # Work in a unit square so the label can be centred at (0.5, 0.5).
  par(usr = c(0, 1, 0, 1))
  text(0.5, 0.5, round(cor(x, y), 2), cex = 1.75)
}
# Full-sample (1991-2020) scatterplot matrix of all series; lower triangle
# shows pairwise correlations via panel.cor.
pairs(cbind(Hate_Crime=new_merged$hc,Black_HC=new_merged$blackhc, Asian_HC=new_merged$asianhc,CPI=new_merged$cpi,detrend_CPI=new_merged$cpi_notrend, Unemployment=new_merged$unemp,Temp=new_merged$temp, Trend=new_merged$time, non_rHC=new_merged$non_rhc),
col="lightcoral", lower.panel=panel.cor)
# Same matrix excluding 2020 (rows 349-360), to check how much the pandemic
# year drives the correlations.
# FIX: detrend_CPI and Trend were previously left at full length (360 rows)
# while every other column was cut to 348, so cbind() recycled them with a
# warning and silently misaligned those two columns; subset all columns to
# the same 348 rows.
pairs(cbind(Hate_Crime=new_merged$hc[1:348],Black_HC=new_merged$blackhc[1:348], Asian_HC=new_merged$asianhc[1:348],CPI=new_merged$cpi[1:348],detrend_CPI=new_merged$cpi_notrend[1:348], Unemployment=new_merged$unemp[1:348],Temp=new_merged$temp[1:348], Trend=new_merged$time[1:348]),
col="dodgerblue3", lower.panel=panel.cor)
#Function that I created in Lab7
# Fit a harmonic-regression model using the n dominant frequencies of x_t.
#
# Computes a raw (unsmoothed, unlogged) periodogram with mvspec (package
# 'astsa'), takes the n frequencies with the largest spectral power, builds
# one cos/sin regressor pair per frequency, and regresses the series on all
# 2n harmonics.
#
# x_t : numeric vector / ts / zoo series to model.
# n   : number of dominant frequencies (cos + sin pairs) to include.
#
# Returns the fitted lm object. Each selected frequency is print()ed as it
# is used -- the surrounding analysis reads the omegas off the console, so
# the trace is kept.
testfunction <- function(x_t, n) {
  myspec <- mvspec(x_t, log = "no")
  # Frequencies of the n largest periodogram ordinates, strongest first
  # (vectorized -- replaces the original c()-grown loop).
  top_idx <- order(myspec$spec, decreasing = TRUE)[seq_len(n)]
  omegaL <- myspec$freq[top_idx]
  t <- time(x_t)
  # Design matrix with columns Xcos1, Xsin1, ..., Xcosn, Xsinn.
  df <- data.frame(matrix(ncol = n * 2, nrow = length(x_t)))
  colL <- character(2 * n)  # preallocated instead of grown element-wise
  for (j in seq_len(n)) {
    colL[2 * j - 1] <- paste0("Xcos", j)
    colL[2 * j] <- paste0("Xsin", j)
    print(omegaL[j])  # console trace of the frequency used for pair j
    df[[2 * j - 1]] <- cos(2 * pi * t * omegaL[j])
    df[[2 * j]] <- sin(2 * pi * t * omegaL[j])
  }
  colnames(df) <- colL
  # Regress the series on all 2n harmonic terms.
  mod <- lm(x_t ~ ., data = df)
  return(mod)
}
#assuming R^2 meaning adjusted R^2?
# Grow the harmonic model one cos/sin pair at a time until its adjusted R^2
# exceeds the requested threshold, then return that model.
#
# x_t       : series to model (passed through to testfunction).
# r_squared : target adjusted R^2 in [0, 1).
#
# NOTE(review): if the target adjusted R^2 is unattainable for this series,
# the while-loop never terminates -- confirm callers only pass reachable
# thresholds (0.5 is used below).
testfunction2<-function(x_t, r_squared){
# Kept for its side effect: mvspec draws the raw periodogram of the input.
# The returned object is otherwise unused here (testfunction recomputes it).
myspec <- mvspec(x_t, log="no")
comparison_r=0
n=0
while (comparison_r <= r_squared){
n=n+1
# Refit from scratch with the n strongest frequencies (see testfunction).
Mod=testfunction(x_t,n)
comparison_r=summary(Mod)$adj.r.squared
}
return(Mod)
}
#all racial hate crime
n = length(allHC$hate_crime)
myspec <- mvspec(allHC$hate_crime, log = "no") #there are several noisy spikes at front since we did not detrend the data
#from midsemester project, detrending using lm was not satisfying. Thus, use non-parametric approach
# Classical decomposition: trend + seasonal + random, monthly frequency.
allhcTS <- ts(allHC$hate_crime, start=c(1991, 1), end=c(2020, 12), frequency=12)
allHC_decomp = decompose(allhcTS)
plot(allHC_decomp)
# Detrended series = seasonal + random components (NAs from the moving
# average at the ends are dropped).
detrended = na.omit(allHC_decomp$seasonal+allHC_decomp$random)
tsplot(detrended)
hc_spec =mvspec(detrended)
mvspec(allhcTS)
# Hand-picked power threshold to isolate the dominant periodogram peak;
# index 30 of 360 corresponds to frequency 30/360 = 1/12 (annual cycle).
which(myspec$spec > 300000) #30
## [1] 30
omega1 = myspec$freq[30] #0.08333333
# NOTE(review): 't' here overwrites the earlier t = 1:360 trend variable
# with the zoo time column (same values, different object).
t = new_merged$time
Xcos = cos(2*pi*t*omega1)
Xsin = sin(2*pi*t*omega1)
# Single-harmonic regression of all-racial hate crime on the annual cycle.
mod_spec1 <- lm(new_merged$hc~Xcos+Xsin)
summary(mod_spec1) #Adjusted R-squared: 0.1503
##
## Call:
## lm(formula = new_merged$hc ~ Xcos + Xsin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -182.59 -71.94 -9.86 49.94 909.07
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 389.061 5.475 71.066 < 2e-16 ***
## Xcos -54.528 7.742 -7.043 9.74e-12 ***
## Xsin -30.868 7.742 -3.987 8.12e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 103.9 on 357 degrees of freedom
## Multiple R-squared: 0.155, Adjusted R-squared: 0.1503
## F-statistic: 32.75 on 2 and 357 DF, p-value: 8.745e-14
# Model selection / diagnostics for the single-harmonic fit.
AIC(mod_spec1)
## [1] 4369.714
plot(mod_spec1, which = 1)
# Overlay fitted values (red) on the observed series.
tsplot(new_merged$hc)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter
## Warning in rep(y, length.out = n): 'x' is NULL so the result will be NULL
lines(ts(mod_spec1$fitted.values,start = 0),col="red")
# Two-harmonic fit via the helper; the omegas are printed to the console.
# NOTE(review): this 'mod_spec2' is later overwritten by the anti-Black
# single-harmonic model near the end of the script.
mod_spec2=testfunction(new_merged$hc, 2) #two sets of cos, sin
## [1] 0.08333333
## [1] 0.002777778
#using the printed value, I found omega1=0.08333333, omega2=0.002777778
summary(mod_spec2) #0.4004
##
## Call:
## lm(formula = x_t ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -177.88 -42.17 -7.34 28.29 839.61
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 389.061 4.599 84.602 < 2e-16 ***
## Xcos1 -54.528 6.504 -8.384 1.21e-15 ***
## Xsin1 -30.868 6.504 -4.746 3.01e-06 ***
## Xcos2 -13.026 6.504 -2.003 0.046 *
## Xsin2 78.835 6.504 12.122 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 87.25 on 355 degrees of freedom
## Multiple R-squared: 0.4071, Adjusted R-squared: 0.4004
## F-statistic: 60.94 on 4 and 355 DF, p-value: < 2.2e-16
# Residuals-vs-fitted diagnostic for the two-harmonic model.
plot(mod_spec2, which=1)
#cycles are:
# Period = 1/frequency: the first peak is the 12-month (annual) cycle.
firstC=1/0.08333333
firstC
## [1] 12
# The second "cycle" equals the full sample length, i.e. it is just the
# fundamental frequency of a 360-point series, not a real cycle.
secondC=1/0.002777778
secondC #is 360, meaningless
## [1] 360
#fit the model with high adj r-squared
# Keep adding cos/sin pairs until adjusted R^2 exceeds 0.5.
a=testfunction2(new_merged$hc, 0.5) #6 set of Cos, Sin. Total 13 variables including intercept
## [1] 0.08333333
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.03055556
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.03055556
## [1] 0.02777778
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.03055556
## [1] 0.02777778
## [1] 0.01388889
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.03055556
## [1] 0.02777778
## [1] 0.01388889
## [1] 0.1666667
# Inspect the 6-harmonic model selected by the adjusted-R^2 search.
summary(a)
##
## Call:
## lm(formula = x_t ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -163.95 -39.98 -3.69 28.78 756.53
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 389.061 4.195 92.733 < 2e-16 ***
## Xcos1 -54.528 5.933 -9.190 < 2e-16 ***
## Xsin1 -30.868 5.933 -5.203 3.37e-07 ***
## Xcos2 -13.026 5.933 -2.195 0.028802 *
## Xsin2 78.835 5.933 13.287 < 2e-16 ***
## Xcos3 14.096 5.933 2.376 0.018059 *
## Xsin3 -23.784 5.933 -4.009 7.48e-05 ***
## Xcos4 -1.242 5.933 -0.209 0.834299
## Xsin4 -26.819 5.933 -4.520 8.49e-06 ***
## Xcos5 9.110 5.933 1.535 0.125610
## Xsin5 -20.601 5.933 -3.472 0.000582 ***
## Xcos6 -24.650 5.933 -4.155 4.11e-05 ***
## Xsin6 -14.116 5.933 -2.379 0.017893 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 79.6 on 347 degrees of freedom
## Multiple R-squared: 0.5177, Adjusted R-squared: 0.501
## F-statistic: 31.03 on 12 and 347 DF, p-value: < 2.2e-16
# Adjusted R^2 of the selected model (just over the 0.5 target).
summary(a)$adj.r.squared #0.5009767
## [1] 0.5009767
plot(a,which=1)
# Overlay fitted values (red) on the observed all-racial series.
tsplot(new_merged$hc)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): 'x' is NULL so the result
## will be NULL
lines(ts(a$fitted.values,start = 0),col="red")
#which(hc_spec$spec > 5000) #30, 60
#omega2 = hc_spec$freq[30] #0.08333333
#t = time(allhcTS)
#Xcos = cos(2*pi*t*omega2)
#Xsin = sin(2*pi*t*omega2)
#mod_spec3 <- lm(allhcTS~Xcos+Xsin)
#summary(mod_spec3) #Adjusted R-squared: 0.1503
#plot(mod_spec3, which = 1)
#tsplot(allhcTS)
#lines(ts(mod_spec3$fitted.values,start = 0),col="red")
#acf(mod_spec3$residuals)
#black hate crime
# Spectral analysis for anti-Black hate crimes: locate the dominant
# periodogram peak in the raw series, then regress the series on one
# cos/sin pair at that frequency.
n = length(blackHC$hate_crime)
myspec2 <- mvspec(blackHC$hate_crime, log = "no") #there are several noisy spikes at front since we did not detrend the data
#from midsemester project, detrending using lm was not satisfying. Thus, use non-parametric approach
blackhcTS <- ts(blackHC$hate_crime, start=c(1991, 1), end=c(2020, 12), frequency=12)
blackHC_decomp = decompose(blackhcTS)
plot(blackHC_decomp)
# Detrended series = seasonal + random components; na.omit drops the NAs
# that decompose() leaves at both ends of the random component.
detrended2 = na.omit(blackHC_decomp$seasonal+blackHC_decomp$random)
tsplot(detrended2)
blackhc_spec =mvspec(detrended2)
mvspec(blackhcTS)
which(myspec2$spec > 80000) #30
## [1] 30
# Dominant frequency 0.08333333 = 1/12 -> an annual (12-month) cycle.
omega2 = myspec2$freq[30] #0.08333333
t = new_merged$time
Xcos = cos(2*pi*t*omega2)
Xsin = sin(2*pi*t*omega2)
mod_spec2 <- lm(new_merged$blackhc~Xcos+Xsin)
summary(mod_spec2) #Adjusted R-squared: 0.1426
##
## Call:
## lm(formula = new_merged$blackhc ~ Xcos + Xsin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -122.39 -37.79 -4.44 28.84 455.61
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 207.672 2.993 69.386 < 2e-16 ***
## Xcos -29.715 4.233 -7.020 1.12e-11 ***
## Xsin -14.912 4.233 -3.523 0.000482 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56.79 on 357 degrees of freedom
## Multiple R-squared: 0.1474, Adjusted R-squared: 0.1426
## F-statistic: 30.85 on 2 and 357 DF, p-value: 4.387e-13
AIC(mod_spec2)
## [1] 3934.939
BIC(mod_spec2)
## [1] 3950.483
plot(mod_spec2, which = 1)
tsplot(new_merged$blackhc)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter
## Warning in rep(y, length.out = n): 'x' is NULL so the result will be NULL
lines(ts(mod_spec2$fitted.values,start = 0),col="red")
# Two-frequency harmonic regression via testfunction.
# NOTE(review): this reuses the name mod_spec3, which is overwritten again in
# the anti-Asian section further below.
mod_spec3=testfunction(new_merged$blackhc, 2) #two sets of cos, sin
## [1] 0.08333333
## [1] 0.002777778
#using the printed value, I found omega1=0.08333333, omega2=0.002777778
summary(mod_spec3) #0.4177
##
## Call:
## lm(formula = x_t ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -114.59 -25.72 -3.55 16.85 472.57
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 207.672 2.467 84.194 < 2e-16 ***
## Xcos1 -29.715 3.488 -8.519 4.66e-16 ***
## Xsin1 -14.912 3.488 -4.275 2.46e-05 ***
## Xcos2 -12.446 3.488 -3.568 0.000409 ***
## Xsin2 43.834 3.488 12.566 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.8 on 355 degrees of freedom
## Multiple R-squared: 0.4241, Adjusted R-squared: 0.4177
## F-statistic: 65.37 on 4 and 355 DF, p-value: < 2.2e-16
plot(mod_spec3, which=1)
#cycles are:
# Period = 1/frequency (in months).
firstC=1/0.08333333
firstC
## [1] 12
secondC=1/0.002777778
secondC #is 360, meaningless
## [1] 360
#fit the model with high adj r-squared
# testfunction2 greedily adds cos/sin pairs until the adj R^2 gain falls
# below threshold 0.5; printed values are the frequencies tried/kept.
b=testfunction2(new_merged$blackhc, 0.5) #5 set of Cos, Sin. Total 13 variables including intercept
## [1] 0.08333333
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.008333333
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.008333333
## [1] 0.02777778
## [1] 0.08333333
## [1] 0.002777778
## [1] 0.008333333
## [1] 0.02777778
## [1] 0.03055556
summary(b)
##
## Call:
## lm(formula = x_t ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -95.53 -25.06 -4.62 19.57 438.86
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 207.67222 2.27734 91.191 < 2e-16 ***
## Xcos1 -29.71521 3.22065 -9.226 < 2e-16 ***
## Xsin1 -14.91212 3.22065 -4.630 5.16e-06 ***
## Xcos2 -12.44581 3.22065 -3.864 0.000133 ***
## Xsin2 43.83440 3.22065 13.610 < 2e-16 ***
## Xcos3 -0.05592 3.22065 -0.017 0.986157
## Xsin3 -16.43700 3.22065 -5.104 5.49e-07 ***
## Xcos4 3.97823 3.22065 1.235 0.217577
## Xsin4 -14.91454 3.22065 -4.631 5.14e-06 ***
## Xcos5 6.68679 3.22065 2.076 0.038605 *
## Xsin5 -12.10298 3.22065 -3.758 0.000201 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43.21 on 349 degrees of freedom
## Multiple R-squared: 0.5174, Adjusted R-squared: 0.5036
## F-statistic: 37.42 on 10 and 349 DF, p-value: < 2.2e-16
summary(b)$adj.r.squared #0.5036
## [1] 0.5035908
plot(b,which=1)
# Overlay the fitted harmonic curve (red) on the observed series.
tsplot(new_merged$blackhc)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): 'x' is NULL so the result
## will be NULL
lines(ts(b$fitted.values,start = 0),col="red")
### anti-asian hate crime
# Same spectral workflow for anti-Asian hate crimes. The spectrum has no
# strong seasonal peak; only very-low-frequency peaks show up.
n = length(asianHC$hate_crime)
myspec3 <- mvspec(asianHC$hate_crime, log = "no") #there are several noisy spikes at front since we did not detrend the data
#from midsemester project, detrending using lm was not satisfying. Thus, use non-parametric approach
asianhcTS <- ts(asianHC$hate_crime, start=c(1991, 1), end=c(2020, 12), frequency=12)
asianHC_decomp = decompose(asianhcTS)
plot(asianHC_decomp)
detrended3 = na.omit(asianHC_decomp$seasonal+asianHC_decomp$random)
tsplot(detrended3)
asianhc_spec =mvspec(detrended3)
which(myspec3$spec > 800) #1, 4
## [1] 1 4
omega3 = myspec3$freq[1] #0.002777778 (the old 0.03333333 note was wrong; 1/omega3 = 360 below)
1/omega3 #360 months
## [1] 360
omega3 = myspec3$freq[4] #0.01111111 (the old 0.03333333 note was wrong; 1/omega3 = 90 below)
1/omega3 #90 months
## [1] 90
# Regress on one cos/sin pair at the 90-month frequency.
t = new_merged$time
Xcos = cos(2*pi*t*omega3)
Xsin = sin(2*pi*t*omega3)
mod_spec3 <- lm(new_merged$asianhc~Xcos+Xsin)
summary(mod_spec3) #Adjusted R-squared: 0.02852
##
## Call:
## lm(formula = new_merged$asianhc ~ Xcos + Xsin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.030 -6.449 -1.673 4.782 30.994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.8444 0.4317 41.338 < 2e-16 ***
## Xcos 1.7652 0.6105 2.891 0.00407 **
## Xsin -1.2481 0.6105 -2.045 0.04164 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.19 on 357 degrees of freedom
## Multiple R-squared: 0.03394, Adjusted R-squared: 0.02852
## F-statistic: 6.27 on 2 and 357 DF, p-value: 0.002107
AIC(mod_spec3)
## [1] 2540.75
plot(mod_spec3, which = 1)
tsplot(new_merged$asianhc)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter
## Warning in rep(y, length.out = n): 'x' is NULL so the result will be NULL
lines(mod_spec3$fitted.values,col="red")
# Abandoned harmonic-search attempts for the anti-Asian series, kept for reference:
#mod_spec4=testfunction(new_merged$asianhc, 1) #two sets of cos, sin
#using the printed value, I found omega1=0.08333333, omega2=0.002777778
#summary(mod_spec4) #0.07639
#plot(mod_spec4, which=1)
#tsplot(new_merged$asianhc)
#lines(mod_spec4$fitted.values,col="red")
#fit the model with high adj r-squared
#c=testfunction2(new_merged$asianhc, 0.5)
#summary(c)
#summary(c)$adj.r.squared #0.5021
#plot(b,which=1)
#tsplot(new_merged$asianhc)
#lines(c$fitted.values,col="red")
# Covariate model for all-race HC: regress hc on everything except the other
# hate-crime series, then backward-eliminate.
mod_select<- lm(hc ~ . -asianhc-blackhc, data=new_merged)
summary(mod_select) # adj R^2 = 0.6086 (the old 0.4148 note was copied from the blackhc model); time is NA — collinear with the other regressors
##
## Call:
## lm(formula = hc ~ . - asianhc - blackhc, data = new_merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -216.42 -41.80 -4.80 33.53 644.94
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 284.09825 28.06003 10.125 < 2e-16 ***
## cpi -1.82312 0.11326 -16.096 < 2e-16 ***
## cpi_notrend -1.57014 1.56425 -1.004 0.31618
## unemp 5.95975 2.29718 2.594 0.00987 **
## temp 1.57198 0.26684 5.891 8.94e-09 ***
## non_rhc 1.58937 0.08676 18.319 < 2e-16 ***
## time NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70.5 on 354 degrees of freedom
## Multiple R-squared: 0.614, Adjusted R-squared: 0.6086
## F-statistic: 112.6 on 5 and 354 DF, p-value: < 2.2e-16
# Backward stepwise selection by AIC.
mod_back=step(mod_select, direction="backward", trace=0)
par(mfrow=c(2,2))
plot(mod_back)
summary(mod_back) #unemp,temp,nonrHC, cpi. adjusted R^2=0.6086
##
## Call:
## lm(formula = hc ~ cpi + unemp + temp + non_rhc, data = new_merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -217.13 -40.44 -3.83 31.87 653.96
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 291.46929 27.08237 10.762 < 2e-16 ***
## cpi -1.83070 0.11301 -16.199 < 2e-16 ***
## unemp 5.23780 2.18170 2.401 0.0169 *
## temp 1.52254 0.26226 5.805 1.43e-08 ***
## non_rhc 1.59414 0.08663 18.401 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70.5 on 355 degrees of freedom
## Multiple R-squared: 0.6129, Adjusted R-squared: 0.6086
## F-statistic: 140.5 on 4 and 355 DF, p-value: < 2.2e-16
# Variance inflation factors: no multicollinearity concern.
vif(mod_back) #all below 5
## cpi unemp temp non_rhc
## 1.311929 1.094028 1.113487 1.509698
#1. fit best lm model
#First, try using only non-hate crime-related variables
# Same four covariates the backward step kept; refit explicitly (na.action=NULL
# keeps the residuals aligned with the time index for the ARMA step below).
fit_hc= lm(hc~ temp + unemp + cpi + non_rhc, data=new_merged, na.action=NULL)
summary(fit_hc) # regression results
##
## Call:
## lm(formula = hc ~ temp + unemp + cpi + non_rhc, data = new_merged,
## na.action = NULL)
##
## Residuals:
## Min 1Q Median 3Q Max
## -217.13 -40.44 -3.83 31.87 653.96
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 291.46929 27.08237 10.762 < 2e-16 ***
## temp 1.52254 0.26226 5.805 1.43e-08 ***
## unemp 5.23780 2.18170 2.401 0.0169 *
## cpi -1.83070 0.11301 -16.199 < 2e-16 ***
## non_rhc 1.59414 0.08663 18.401 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70.5 on 355 degrees of freedom
## Multiple R-squared: 0.6129, Adjusted R-squared: 0.6086
## F-statistic: 140.5 on 4 and 355 DF, p-value: < 2.2e-16
vif(fit_hc)
## temp unemp cpi non_rhc
## 1.113487 1.094028 1.311929 1.509698
plot(fit_hc$residuals)
lines(lowess(fit_hc$residuals)) #pretty linear. take residuals and fit ARMA
#2. check stationarity for the residuals and fit ARMA
# Four unit-root/stationarity tests on the regression residuals:
# ADF fails at the 5% level (p = 0.0827), but PP, KPSS and ERS all pass,
# so we proceed treating x_t as stationary.
x_t=fit_hc$residuals
tsplot(x_t) #concern with spikes and trend
adf.test(x_t) #did not passed
##
## Augmented Dickey-Fuller Test
##
## data: x_t
## Dickey-Fuller = -3.2323, Lag order = 7, p-value = 0.0827
## alternative hypothesis: stationary
pp.test(x_t) #pass
## Warning in pp.test(x_t): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: x_t
## Dickey-Fuller Z(alpha) = -222.16, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(x_t) #pass
## Warning in kpss.test(x_t): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: x_t
## KPSS Level = 0.17177, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(x_t)) #pass
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -223.08 -28.49 6.76 39.83 606.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.10242 0.03703 -2.766 0.00598 **
## yd.diff.lag1 -0.47013 0.06053 -7.767 8.95e-14 ***
## yd.diff.lag2 -0.32054 0.06378 -5.025 8.03e-07 ***
## yd.diff.lag3 -0.18405 0.06184 -2.976 0.00312 **
## yd.diff.lag4 -0.01820 0.05399 -0.337 0.73619
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 64.07 on 350 degrees of freedom
## Multiple R-squared: 0.2587, Adjusted R-squared: 0.2481
## F-statistic: 24.42 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -2.7657
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# ACF/PACF shapes guide the initial ARMA order choice below.
acf(x_t) #decay to 0, might be fixed with AR(1)
pacf(x_t) #AR(1) or AR(2)
mean(x_t) #close to 0
## [1] 9.341341e-16
#start with AR(1)
# Candidate ARMA models for the residual series x_t (zero-mean, since the
# regression residuals average ~0). Compared on AICc/BIC and residual ACF/PACF.
error_mod1=Arima(x_t, order=c(1,0,0), include.mean = FALSE)
summary(error_mod1) #AICc=4003.9 BIC=4011.64
## Series: x_t
## ARIMA(1,0,0) with zero mean
##
## Coefficients:
## ar1
## 0.4488
## s.e. 0.0471
##
## sigma^2 = 3925: log likelihood = -1999.94
## AIC=4003.87 AICc=4003.9 BIC=4011.64
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.06343757 62.55999 43.36681 39.93476 206.3167 0.8287035
## ACF1
## Training set -0.0664218
acf(error_mod1$residuals) #some lags
pacf(error_mod1$residuals) #some lags
#try AR(2)
error_mod2=Arima(x_t, order=c(2,0,0), include.mean = FALSE)
summary(error_mod2) #AICc=3998.11 BIC=4009.71
## Series: x_t
## ARIMA(2,0,0) with zero mean
##
## Coefficients:
## ar1 ar2
## 0.3833 0.1469
## s.e. 0.0521 0.0522
##
## sigma^2 = 3851: log likelihood = -1996.02
## AIC=3998.05 AICc=3998.11 BIC=4009.71
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.1232347 61.88002 42.37744 48.48712 198.3662 0.8097975
## ACF1
## Training set -0.01776657
coeftest(error_mod2) #all sig.
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.383252 0.052073 7.3599 1.841e-13 ***
## ar2 0.146884 0.052210 2.8133 0.004903 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(error_mod2$residuals) #three lags seem random
pacf(error_mod2$residuals) #three lags seem random
#try auto arima
error_mod3=auto.arima(x_t)
summary(error_mod3) #(1,0,2) AICc=3992.63 BIC=4008.06
## Series: x_t
## ARIMA(1,0,2) with zero mean
##
## Coefficients:
## ar1 ma1 ma2
## 0.8909 -0.5405 -0.1081
## s.e. 0.0698 0.0922 0.0708
##
## sigma^2 = 3780: log likelihood = -1992.26
## AIC=3992.51 AICc=3992.63 BIC=4008.06
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.6995925 61.22625 41.55968 40.85384 219.5544 0.7941707
## ACF1
## Training set 0.0003125806
coeftest(error_mod3) #ma2 not sig.
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.890935 0.069808 12.7627 < 2.2e-16 ***
## ma1 -0.540512 0.092213 -5.8616 4.586e-09 ***
## ma2 -0.108135 0.070785 -1.5277 0.1266
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(error_mod3$residuals) #only two lags, 10 and 25 but small
pacf(error_mod3$residuals) #only two lags, 10 and 25 but small
#try 101
# ARMA(1,1): drops the insignificant ma2 term from auto.arima's (1,0,2).
error_mod4=Arima(x_t, order=c(1,0,1), include.mean=FALSE)
summary(error_mod4) # AICc=3992.66 BIC=4004.25 #better than auto arima on BIC (4004.25 < 4008.06), essentially tied on AICc
## Series: x_t
## ARIMA(1,0,1) with zero mean
##
## Coefficients:
## ar1 ma1
## 0.800 -0.4697
## s.e. 0.071 0.1082
##
## sigma^2 = 3792: log likelihood = -1993.3
## AIC=3992.6 AICc=3992.66 BIC=4004.25
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.3223018 61.4083 41.76911 46.61081 208.233 0.7981728 0.02286966
coeftest(error_mod4) #all sig.
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.799976 0.070979 11.2707 < 2.2e-16 ***
## ma1 -0.469693 0.108171 -4.3421 1.411e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(error_mod4$residuals) #small lags at 10,25
pacf(error_mod4$residuals) #small lags at 10,25
#add more terms
# ARMA(2,1): marginal ar2 (p = 0.057), no real improvement over (1,0,1).
error_mod5=Arima(x_t, order=c(2,0,1), include.mean=FALSE)
summary(error_mod5) # AICc=3992.43 BIC=4007.86
## Series: x_t
## ARIMA(2,0,1) with zero mean
##
## Coefficients:
## ar1 ar2 ma1
## 1.1135 -0.1826 -0.7605
## s.e. 0.1429 0.0960 0.1271
##
## sigma^2 = 3778: log likelihood = -1992.16
## AIC=3992.32 AICc=3992.43 BIC=4007.86
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.8624684 61.20797 41.4777 42.40561 222.5925 0.7926041
## ACF1
## Training set -0.002950645
coeftest(error_mod5) #all sig.
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 1.113539 0.142938 7.7904 6.680e-15 ***
## ar2 -0.182571 0.096028 -1.9012 0.05727 .
## ma1 -0.760540 0.127077 -5.9849 2.165e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(error_mod5$residuals) #small lags at 10,25
pacf(error_mod5$residuals) #small lags at 10,25
#check stationarity for the best model
# error_mod4 (ARMA(1,1)) chosen as best; its residuals pass all four
# stationarity tests below.
tsplot(error_mod4$residuals) #looks stationary, outliers concerned
adf.test(error_mod4$residuals)
## Warning in adf.test(error_mod4$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: error_mod4$residuals
## Dickey-Fuller = -5.6754, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(error_mod4$residuals)
## Warning in pp.test(error_mod4$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: error_mod4$residuals
## Dickey-Fuller Z(alpha) = -338.03, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(error_mod4$residuals)
## Warning in kpss.test(error_mod4$residuals): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: error_mod4$residuals
## KPSS Level = 0.1002, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(error_mod4$residuals)) #pass all
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -222.53 -28.89 9.26 44.36 588.33
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.13901 0.04714 -2.949 0.0034 **
## yd.diff.lag1 -0.65347 0.06472 -10.096 < 2e-16 ***
## yd.diff.lag2 -0.52137 0.06905 -7.550 3.81e-13 ***
## yd.diff.lag3 -0.33649 0.06556 -5.132 4.75e-07 ***
## yd.diff.lag4 -0.09464 0.05319 -1.779 0.0761 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67.46 on 350 degrees of freedom
## Multiple R-squared: 0.3968, Adjusted R-squared: 0.3882
## F-statistic: 46.05 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -2.9488
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
#final model
# Regression with ARMA(1,1) errors: hc on the four exogenous covariates,
# refit jointly via Arima() with xreg (rather than the two-step lm + ARMA).
final_mod1=Arima(new_merged$hc, order=c(1,0,1), xreg =cbind(new_merged$temp, new_merged$unemp, new_merged$cpi, new_merged$non_rhc))
summary(final_mod1) #AICc=3987.95 BIC=4018.62
## Series: new_merged$hc
## Regression with ARIMA(1,0,1) errors
##
## Coefficients:
## ar1 ma1 intercept new_merged$temp new_merged$unemp
## 0.9607 -0.7288 140.0553 1.7436 17.3957
## s.e. 0.0311 0.0751 135.1145 0.2318 4.7038
## new_merged$cpi new_merged$non_rhc
## -1.1975 1.3340
## s.e. 0.6218 0.0858
##
## sigma^2 = 3682: log likelihood = -1985.77
## AIC=3987.54 AICc=3987.95 BIC=4018.62
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.975784 60.08839 39.84037 -1.061 10.19967 0.7771935 0.09089999
# Residual diagnostics: stationarity passes all four tests, but normality
# (Shapiro-Wilk, QQ plot) fails due to right-tail outliers.
tsplot(final_mod1$residuals)
acf(final_mod1$residuals, lag=50) #looks okay
pacf(final_mod1$residuals, lag=50) #5, 10, 25 lags with small sig. 3/50
adf.test(final_mod1$residuals)
## Warning in adf.test(final_mod1$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: final_mod1$residuals
## Dickey-Fuller = -6.3053, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(final_mod1$residuals)
## Warning in pp.test(final_mod1$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: final_mod1$residuals
## Dickey-Fuller Z(alpha) = -299.47, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(final_mod1$residuals)
## Warning in kpss.test(final_mod1$residuals): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: final_mod1$residuals
## KPSS Level = 0.17817, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(final_mod1$residuals)) #passed all
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.60 -24.25 8.89 44.94 597.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.17339 0.05031 -3.447 0.000637 ***
## yd.diff.lag1 -0.53456 0.06578 -8.127 7.68e-15 ***
## yd.diff.lag2 -0.45316 0.06767 -6.697 8.49e-11 ***
## yd.diff.lag3 -0.27966 0.06367 -4.393 1.49e-05 ***
## yd.diff.lag4 -0.07174 0.05331 -1.346 0.179274
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 65.89 on 350 degrees of freedom
## Multiple R-squared: 0.3519, Adjusted R-squared: 0.3426
## F-statistic: 38 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -3.4467
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
qqPlot(final_mod1$residuals) #concern with outliers on the right side but pretty normal
## [1] 354 129
shapiro.test(final_mod1$residuals) #very small, 2.2e-16. Did not passed
##
## Shapiro-Wilk normality test
##
## data: final_mod1$residuals
## W = 0.80699, p-value < 2.2e-16
#get R^2 of the model: squared correlation between in-sample fitted values
#and the observed series.
# Vectorized extraction of the fitted values (as.numeric drops the ts
# attributes). Replaces the previous element-by-element append()/rev() loop
# over a hard-coded 1:360, which built the identical vector in O(n^2) and
# silently broke if the series length ever changed.
prediction=as.numeric(final_mod1$fitted)
cv_df=data.frame("predict"=prediction, "real"=new_merged$hc)
cor(cv_df$real,cv_df$predict)^2 # R^2 = 0.714976
## [1] 0.714976
#cor(fitted(final_mod1), allHC$hate_crime)^2
# RMSE of fitted vs observed, and RMSE normalized by the observed range.
RMSE=sqrt(mean((cv_df$real - cv_df$predict)^2))
N_RMSE=RMSE/(max(cv_df$real)-min(cv_df$real))
round(RMSE , digits = 3)
## [1] 60.088
round(N_RMSE, digits = 3) # 0.053
## [1] 0.053
# Observed series (black, thick) with fitted values overlaid in red.
tsplot(new_merged$hc, lwd=2)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter
## Warning in rep(y, length.out = n): 'x' is NULL so the result will be NULL
lines(final_mod1$fitted, col="red", pch=50)
#cross-validation
# Hold out 2019-2020 (months 349-360): refit on the first 348 months only.
# NOTE(review): `a` here overwrites the harmonic-regression model `a` fit
# earlier in the file.
a=new_merged$hc[1:348,] #including only through 2018
t=1:348
sarimaMod=Arima(new_merged$hc[1:348,],order=c(1,0,1), xreg =cbind(new_merged$temp[1:348,], new_merged$unemp[1:348,], new_merged$cpi[1:348,], new_merged$non_rhc[1:348,]))
summary(sarimaMod) #AICc=3718.89 BIC=3749.28 (the old 3592.9/3622.99 note did not match this output)
## Series: new_merged$hc[1:348, ]
## Regression with ARIMA(1,0,1) errors
##
## Coefficients:
## ar1 ma1 intercept new_merged$temp[1:348, ]
## 0.9199 -0.7308 378.0114 1.5067
## s.e. 0.0435 0.0700 60.1012 0.1946
## new_merged$unemp[1:348, ] new_merged$cpi[1:348, ]
## -1.6573 -1.7443
## s.e. 4.3737 0.2475
## new_merged$non_rhc[1:348, ]
## 1.2742
## s.e. 0.0718
##
## sigma^2 = 2491: log likelihood = -1851.23
## AIC=3718.47 AICc=3718.89 BIC=3749.28
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.8619917 49.40817 36.08097 -0.9283992 9.558511 0.7530432
## ACF1
## Training set 0.03188773
# Forecast 12 months ahead, holding each covariate at its training-set mean.
fcast2 <- forecast(sarimaMod, xreg=cbind(rep(mean(new_merged$temp[1:348,]), 12), rep(mean(new_merged$unemp[1:348,]), 12), rep(mean(new_merged$cpi[1:348,]),12), rep(mean(new_merged$non_rhc[1:348,]),12)))
## Warning in forecast.forecast_ARIMA(sarimaMod, xreg =
## cbind(rep(mean(new_merged$temp[1:348, : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
autoplot(fcast2)
#real data
# Compare the 12-month forecast against the held-out observations.
real_value=new_merged$hc[c(349:360)]
#compare with real data
prediction2={}
for (i in 1:12){
print(fcast2$mean[i])
prediction2=append(fcast2$mean[i], prediction2)
}
## [1] 381.2731
## [1] 381.3204
## [1] 381.364
## [1] 381.4041
## [1] 381.4409
## [1] 381.4749
## [1] 381.5061
## [1] 381.5347
## [1] 381.5611
## [1] 381.5854
## [1] 381.6078
## [1] 381.6283
# append() above prepends each new value, so reverse to restore time order.
prediction2=rev(prediction2)
cv_df1=data.frame("predict"=prediction2, "real"=real_value)
# Out-of-sample fit is poor: the forecast is nearly flat while 2020 spiked.
cor(cv_df1$real,cv_df1$predict)^2 # R^2 = 0.1312307
## [1] 0.1312307
RMSE1=sqrt(mean((cv_df1$real - cv_df1$predict)^2))
N_RMSE1=RMSE1/(max(cv_df1$real)-min(cv_df1$real))
round(RMSE1 , digits = 3)
## [1] 284.806
round(N_RMSE1, digits = 3) #0.346
## [1] 0.346
#make prediction
# Refit on the full sample and forecast 12 and 24 months ahead with
# covariates held at their full-sample means.
final_mod1=Arima(new_merged$hc, order=c(1,0,1), xreg =cbind(new_merged$temp, new_merged$unemp, new_merged$cpi, new_merged$non_rhc))
xreg1_pred=forecast(final_mod1, xreg=cbind(rep(mean(new_merged$temp), 12), rep(mean(new_merged$unemp), 12), rep(mean(new_merged$cpi),12), rep(mean(new_merged$non_rhc),12)))
## Warning in forecast.forecast_ARIMA(final_mod1, xreg =
## cbind(rep(mean(new_merged$temp), : xreg contains different column names from the
## xreg used in training. Please check that the regressors are in the same order.
autoplot(xreg1_pred)+xlab("Time")+ylab("all racial hate crimes")
xreg2_pred=forecast(final_mod1, xreg=cbind(rep(mean(new_merged$temp), 24), rep(mean(new_merged$unemp), 24), rep(mean(new_merged$cpi),24), rep(mean(new_merged$non_rhc),24)))
## Warning in forecast.forecast_ARIMA(final_mod1, xreg =
## cbind(rep(mean(new_merged$temp), : xreg contains different column names from the
## xreg used in training. Please check that the regressors are in the same order.
autoplot(xreg2_pred)+xlab("Time")+ylab("all racial hate crimes")
#black HC
# Repeat the whole covariate-regression workflow for anti-Black hate crimes.
mod_select2<- lm(blackhc ~ . -asianhc-hc, data=new_merged)
summary(mod_select2) #0.4148 , time NA ??
##
## Call:
## lm(formula = blackhc ~ . - asianhc - hc, data = new_merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -223.32 -24.51 -3.06 18.68 453.75
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 172.17548 18.67241 9.221 < 2e-16 ***
## cpi -0.85810 0.07537 -11.385 < 2e-16 ***
## cpi_notrend -0.21537 1.04093 -0.207 0.8362
## unemp 2.54736 1.52865 1.666 0.0965 .
## temp 1.02200 0.17757 5.755 1.87e-08 ***
## non_rhc 0.62237 0.05773 10.780 < 2e-16 ***
## time NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.91 on 354 degrees of freedom
## Multiple R-squared: 0.423, Adjusted R-squared: 0.4148
## F-statistic: 51.9 on 5 and 354 DF, p-value: < 2.2e-16
# Backward stepwise selection by AIC keeps the same four covariates as
# the all-race model.
mod_back2=step(mod_select2, direction="backward", trace=0)
plot(mod_back2)
summary(mod_back2) #unemp,temp,nonrHC, cpi. adjusted R^2=0.4164
##
## Call:
## lm(formula = blackhc ~ cpi + unemp + temp + non_rhc, data = new_merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -223.45 -24.28 -3.28 18.83 454.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 173.18652 17.99733 9.623 < 2e-16 ***
## cpi -0.85914 0.07510 -11.440 < 2e-16 ***
## unemp 2.44833 1.44983 1.689 0.0922 .
## temp 1.01522 0.17428 5.825 1.28e-08 ***
## non_rhc 0.62303 0.05757 10.822 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.85 on 355 degrees of freedom
## Multiple R-squared: 0.4229, Adjusted R-squared: 0.4164
## F-statistic: 65.04 on 4 and 355 DF, p-value: < 2.2e-16
vif(mod_back2) #all below 5
## cpi unemp temp non_rhc
## 1.311929 1.094028 1.113487 1.509698
#1. fit best lm model
# Explicit refit with na.action=NULL for the residual-ARMA step.
fit_hc2= lm(blackhc~ temp + unemp + cpi + non_rhc, data=new_merged, na.action=NULL)
summary(fit_hc2) # regression results
##
## Call:
## lm(formula = blackhc ~ temp + unemp + cpi + non_rhc, data = new_merged,
## na.action = NULL)
##
## Residuals:
## Min 1Q Median 3Q Max
## -223.45 -24.28 -3.28 18.83 454.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 173.18652 17.99733 9.623 < 2e-16 ***
## temp 1.01522 0.17428 5.825 1.28e-08 ***
## unemp 2.44833 1.44983 1.689 0.0922 .
## cpi -0.85914 0.07510 -11.440 < 2e-16 ***
## non_rhc 0.62303 0.05757 10.822 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.85 on 355 degrees of freedom
## Multiple R-squared: 0.4229, Adjusted R-squared: 0.4164
## F-statistic: 65.04 on 4 and 355 DF, p-value: < 2.2e-16
vif(fit_hc2)
## temp unemp cpi non_rhc
## 1.113487 1.094028 1.311929 1.509698
plot(fit_hc2$residuals)
lines(lowess(fit_hc2$residuals)) #pretty linear. take residuals and fit ARMA
#2. check stationarity for the residuals and fit ARMA
# Same four stationarity tests as for the all-race residuals: ADF fails at
# the 5% level (p = 0.084) but PP, KPSS and ERS pass, so we proceed.
x_t2=fit_hc2$residuals
tsplot(x_t2) #concern with spikes and trend
adf.test(x_t2) #did not passed
##
## Augmented Dickey-Fuller Test
##
## data: x_t2
## Dickey-Fuller = -3.2257, Lag order = 7, p-value = 0.08382
## alternative hypothesis: stationary
pp.test(x_t2) #pass
## Warning in pp.test(x_t2): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: x_t2
## Dickey-Fuller Z(alpha) = -163.48, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(x_t2) #pass
## Warning in kpss.test(x_t2): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: x_t2
## KPSS Level = 0.20784, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(x_t2)) #pass 5pct
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -212.37 -13.31 3.49 22.39 443.79
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.05583 0.02705 -2.064 0.03972 *
## yd.diff.lag1 -0.40120 0.05726 -7.007 1.26e-11 ***
## yd.diff.lag2 -0.30311 0.05979 -5.070 6.46e-07 ***
## yd.diff.lag3 -0.18723 0.05869 -3.190 0.00155 **
## yd.diff.lag4 -0.06528 0.05399 -1.209 0.22741
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.28 on 350 degrees of freedom
## Multiple R-squared: 0.191, Adjusted R-squared: 0.1794
## F-statistic: 16.53 on 5 and 350 DF, p-value: 1.186e-14
##
##
## Value of test-statistic is: -2.0644
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(x_t2) #decay to 0, might be fixed with AR(1)
pacf(x_t2) #AR(1)
mean(x_t2) #close to 0
## [1] 4.658626e-15
#start with AR(1)
# Candidate zero-mean ARMA models for the black-HC regression residuals,
# compared on AICc/BIC and residual ACF/PACF. ARMA(2,1) wins (and matches
# auto.arima's choice below).
error_mod1.2=Arima(x_t2, order=c(1,0,0), include.mean = FALSE)
summary(error_mod1.2) #AICc=3649.91 BIC=3657.65
## Series: x_t2
## ARIMA(1,0,0) with zero mean
##
## Coefficients:
## ar1
## 0.5700
## s.e. 0.0434
##
## sigma^2 = 1467: log likelihood = -1822.94
## AIC=3649.88 AICc=3649.91 BIC=3657.65
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.050652 38.25346 24.15232 72.73581 232.2421 0.8515898 -0.07009319
acf(error_mod1.2$residuals, 50) #2/50
pacf(error_mod1.2$residuals, 50) #1/50 looks good
#start with AR(1)
error_mod1.3=Arima(x_t2, order=c(1,0,1), include.mean = FALSE)
summary(error_mod1.3) # AICc=3643.67 BIC=3655.26
## Series: x_t2
## ARIMA(1,0,1) with zero mean
##
## Coefficients:
## ar1 ma1
## 0.7775 -0.3241
## s.e. 0.0655 0.1054
##
## sigma^2 = 1438: log likelihood = -1818.8
## AIC=3643.6 AICc=3643.67 BIC=3655.26
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.1599784 37.81305 23.33107 67.89707 233.5927 0.8226335 0.021852
acf(error_mod1.3$residuals, 50) #good
pacf(error_mod1.3$residuals, 50) #1/50 looks good
#add terms
error_mod1.4=Arima(x_t2, order=c(2,0,1), include.mean = FALSE)
summary(error_mod1.4) # AICc=3638.91 BIC=3654.34
## Series: x_t2
## ARIMA(2,0,1) with zero mean
##
## Coefficients:
## ar1 ar2 ma1
## 1.2920 -0.3249 -0.8277
## s.e. 0.0949 0.0782 0.0728
##
## sigma^2 = 1414: log likelihood = -1815.4
## AIC=3638.8 AICc=3638.91 BIC=3654.34
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.7873559 37.44777 22.74578 29.64259 289.8964 0.8019963
## ACF1
## Training set -0.002241382
acf(error_mod1.4$residuals, 50) #good
pacf(error_mod1.4$residuals, 50) #good
#add terms
error_mod1.5=Arima(x_t2, order=c(2,0,2), include.mean = FALSE)
summary(error_mod1.5) #AICc=3640.95 BIC=3660.21 got worse
## Series: x_t2
## ARIMA(2,0,2) with zero mean
##
## Coefficients:
## ar1 ar2 ma1 ma2
## 1.2645 -0.2989 -0.7980 -0.0212
## s.e. 0.2511 0.2300 0.2567 0.1666
##
## sigma^2 = 1418: log likelihood = -1815.39
## AIC=3640.78 AICc=3640.95 BIC=3660.21
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.79204 37.44663 22.74146 28.39091 291.2595 0.8018441 -0.004337836
acf(error_mod1.5$residuals, 50) #good
pacf(error_mod1.5$residuals, 50) #good
#auto arima
# auto.arima independently selects the same ARMA(2,1) as error_mod1.4.
error_mod1.6=auto.arima(x_t2)
summary(error_mod1.6) #2,0,1
## Series: x_t2
## ARIMA(2,0,1) with zero mean
##
## Coefficients:
## ar1 ar2 ma1
## 1.2920 -0.3249 -0.8277
## s.e. 0.0949 0.0782 0.0728
##
## sigma^2 = 1414: log likelihood = -1815.4
## AIC=3638.8 AICc=3638.91 BIC=3654.34
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.7873559 37.44777 22.74578 29.64259 289.8964 0.8019963
## ACF1
## Training set -0.002241382
acf(error_mod1.6$residuals, 50) #good
pacf(error_mod1.6$residuals, 50) #good
#check stationarity for the best model
tsplot(error_mod1.4$residuals) #looks stationary, outliers concerned
adf.test(error_mod1.4$residuals)
## Warning in adf.test(error_mod1.4$residuals): p-value smaller than printed p-
## value
##
## Augmented Dickey-Fuller Test
##
## data: error_mod1.4$residuals
## Dickey-Fuller = -6.2597, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(error_mod1.4$residuals)
## Warning in pp.test(error_mod1.4$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: error_mod1.4$residuals
## Dickey-Fuller Z(alpha) = -355.99, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(error_mod1.4$residuals)
## Warning in kpss.test(error_mod1.4$residuals): p-value greater than printed p-
## value
##
## KPSS Test for Level Stationarity
##
## data: error_mod1.4$residuals
## KPSS Level = 0.072607, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(error_mod1.4$residuals)) #pass all
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -202.38 -14.25 4.47 22.49 439.70
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.08316 0.03702 -2.246 0.0253 *
## yd.diff.lag1 -0.73245 0.06045 -12.117 < 2e-16 ***
## yd.diff.lag2 -0.55559 0.06821 -8.145 6.75e-15 ***
## yd.diff.lag3 -0.35583 0.06605 -5.387 1.31e-07 ***
## yd.diff.lag4 -0.13673 0.05296 -2.582 0.0102 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41.23 on 350 degrees of freedom
## Multiple R-squared: 0.4052, Adjusted R-squared: 0.3967
## F-statistic: 47.68 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -2.2462
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
#final model
# --- Final model for blackhc: regression with ARIMA(2,0,1) errors and
# exogenous regressors temp, unemp, cpi, non_rhc (orders carried over from
# the x_t2 residual search above). ---
final_mod2=Arima(new_merged$blackhc, order=c(2,0,1), xreg =cbind(new_merged$temp, new_merged$unemp, new_merged$cpi, new_merged$non_rhc))
summary(final_mod2) #AICc=3602.76 BIC=3637.22
## Series: new_merged$blackhc
## Regression with ARIMA(2,0,1) errors
##
## Coefficients:
## ar1 ar2 ma1 intercept new_merged$temp new_merged$unemp
## 1.3523 -0.3618 -0.853 -4.1477 1.3253 5.4139
## s.e. 0.0753 0.0723 0.043 148.5399 0.1772 2.7158
## new_merged$cpi new_merged$non_rhc
## 0.2386 0.2815
## s.e. 0.7318 0.0511
##
## sigma^2 = 1257: log likelihood = -1792.12
## AIC=3602.24 AICc=3602.76 BIC=3637.22
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.5938505 35.06441 21.45948 -1.549571 10.29595 0.7554376
## ACF1
## Training set 0.02318245
# note: `lag=` below partially matches acf/pacf's `lag.max` argument
acf(final_mod2$residuals, lag=50) #looks okay
pacf(final_mod2$residuals, lag=50) #looks good
adf.test(final_mod2$residuals)
## Warning in adf.test(final_mod2$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: final_mod2$residuals
## Dickey-Fuller = -5.7921, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(final_mod2$residuals)
## Warning in pp.test(final_mod2$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: final_mod2$residuals
## Dickey-Fuller Z(alpha) = -336.08, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(final_mod2$residuals)
## Warning in kpss.test(final_mod2$residuals): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: final_mod2$residuals
## KPSS Level = 0.33501, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(final_mod2$residuals)) #passed all
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -79.46 -12.34 5.41 24.89 448.71
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.27119 0.06651 -4.077 5.64e-05 ***
## yd.diff.lag1 -0.55479 0.07429 -7.467 6.57e-13 ***
## yd.diff.lag2 -0.46727 0.07367 -6.343 6.98e-10 ***
## yd.diff.lag3 -0.32171 0.06707 -4.797 2.39e-06 ***
## yd.diff.lag4 -0.12645 0.05321 -2.376 0.018 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37.89 on 350 degrees of freedom
## Multiple R-squared: 0.4132, Adjusted R-squared: 0.4049
## F-statistic: 49.3 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -4.0775
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
qqPlot(final_mod2$residuals) #concern with outliers on the right side but pretty normal
## [1] 354 61
shapiro.test(final_mod2$residuals) #very small, 2.2e-16. Did not pass normality
##
## Shapiro-Wilk normality test
##
## data: final_mod2$residuals
## W = 0.7041, p-value < 2.2e-16
#get R^2 of the model
# In-sample fit assessment for final_mod2: squared correlation between fitted
# and observed blackhc, plus RMSE and range-normalized RMSE.
# Vectorized: replaces an O(n^2) grow-in-loop (`append()` one element at a
# time, then `rev()`) with a single coercion that yields the same numeric
# vector of fitted values.
prediction=as.numeric(final_mod2$fitted)
cv_df=data.frame("predict"=prediction, "real"=new_merged$blackhc)
cor(cv_df$real,cv_df$predict)^2 # R^2 = 0.6724792
## [1] 0.6724792
RMSE=sqrt(mean((cv_df$real - cv_df$predict)^2))
N_RMSE=RMSE/(max(cv_df$real)-min(cv_df$real))
round(RMSE , digits = 3)
## [1] 35.064
round(N_RMSE, digits = 3) # 0.058
## [1] 0.058
# --- Fitted-vs-actual plot, holdout validation (train 1991-2019, test 2020),
# and mean-regressor forecasts for the blackhc model. ---
tsplot(new_merged$blackhc, lwd=2)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter
## Warning in rep(y, length.out = n): 'x' is NULL so the result will be NULL
lines(final_mod2$fitted, col="red", pch=50)
#cross-validation
# NOTE(review): `a` and `t` below are assigned but never used afterwards.
a=new_merged$blackhc[1:336,] #including only through 2019
t=1:336
sarimaMod2=Arima(new_merged$blackhc[1:336,],order=c(2,0,1), xreg =cbind(new_merged$temp[1:336,], new_merged$unemp[1:336,], new_merged$cpi[1:336,], new_merged$non_rhc[1:336,]))
summary(sarimaMod2) #AICc=3151.33 BIC=3185.14 (printed output below; an earlier comment cited stale values)
## Series: new_merged$blackhc[1:336, ]
## Regression with ARIMA(2,0,1) errors
##
## Coefficients:
## ar1 ar2 ma1 intercept new_merged$temp[1:336, ]
## 1.1975 -0.2064 -0.7680 3.8370 1.1595
## s.e. 0.0817 0.0782 0.0534 225.6145 0.1206
## new_merged$unemp[1:336, ] new_merged$cpi[1:336, ]
## 2.1096 0.3094
## s.e. 3.4498 1.0811
## new_merged$non_rhc[1:336, ]
## 0.2459
## s.e. 0.0379
##
## sigma^2 = 667.8: log likelihood = -1566.39
## AIC=3150.78 AICc=3151.33 BIC=3185.14
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.3559803 25.53286 19.80543 -1.169533 9.838228 0.7524177
## ACF1
## Training set 0.001820025
# Forecast 12 months ahead holding every regressor at its training-period mean.
# The column-name warning is benign as long as column ORDER matches training.
fcast3 <- forecast(sarimaMod2, xreg=cbind(rep(mean(new_merged$temp[1:336,]), 12), rep(mean(new_merged$unemp[1:336,]), 12), rep(mean(new_merged$cpi[1:336,]),12), rep(mean(new_merged$non_rhc[1:336,]),12)))
## Warning in forecast.forecast_ARIMA(sarimaMod2, xreg =
## cbind(rep(mean(new_merged$temp[1:336, : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
autoplot(fcast3)
#real data
real_value=new_merged$blackhc[c(337:348)]
#compare with real data
prediction2={}
for (i in 1:12){
print(fcast3$mean[i])
prediction2=append(fcast3$mean[i], prediction2)
}
## [1] 152.6256
## [1] 149.4486
## [1] 149.1586
## [1] 149.467
## [1] 149.8962
## [1] 150.3464
## [1] 150.797
## [1] 151.2436
## [1] 151.6855
## [1] 152.1224
## [1] 152.5544
## [1] 152.9815
prediction2=rev(prediction2)
cv_df1=data.frame("predict"=prediction2, "real"=real_value)
cor(cv_df1$real,cv_df1$predict)^2 # R^2 = 0.01080143
## [1] 0.01080143
RMSE1=sqrt(mean((cv_df1$real - cv_df1$predict)^2))
N_RMSE1=RMSE1/(max(cv_df1$real)-min(cv_df1$real))
round(RMSE1 , digits = 3)
## [1] 36.944
round(N_RMSE1, digits = 3) #0.528
## [1] 0.528
#make prediction
# 12- and 24-month forecasts from the full-sample model, regressors at means.
xreg3_pred=forecast(final_mod2, xreg=cbind(rep(mean(new_merged$temp), 12), rep(mean(new_merged$unemp), 12), rep(mean(new_merged$cpi),12), rep(mean(new_merged$non_rhc),12)))
## Warning in forecast.forecast_ARIMA(final_mod2, xreg =
## cbind(rep(mean(new_merged$temp), : xreg contains different column names from the
## xreg used in training. Please check that the regressors are in the same order.
autoplot(xreg3_pred)+xlab("Time")+ylab("anti-Black or African American hate crimes")
xreg3_pred=forecast(final_mod2, xreg=cbind(rep(mean(new_merged$temp), 24), rep(mean(new_merged$unemp), 24), rep(mean(new_merged$cpi),24), rep(mean(new_merged$non_rhc),24)))
## Warning in forecast.forecast_ARIMA(final_mod2, xreg =
## cbind(rep(mean(new_merged$temp), : xreg contains different column names from the
## xreg used in training. Please check that the regressors are in the same order.
autoplot(xreg3_pred)+xlab("Time")+ylab("anti-Black or African American hate crimes")
#asian HC
# --- Variable selection for the anti-Asian hate-crime series: full lm then
# backward stepwise (AIC). ---
mod_select3<- lm(asianhc ~ . -blackhc-hc-cpi_notrend, data=new_merged)
summary(mod_select3) #0.382 ,
##
## Call:
## lm(formula = asianhc ~ . - blackhc - hc - cpi_notrend, data = new_merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.013 -4.073 -1.121 3.102 40.401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.277000 18.702777 5.094 5.71e-07 ***
## cpi -0.628370 0.144624 -4.345 1.82e-05 ***
## unemp 0.232443 0.212856 1.092 0.275567
## temp 0.035546 0.024726 1.438 0.151425
## non_rhc 0.055613 0.008039 6.918 2.16e-11 ***
## time 0.174372 0.052402 3.328 0.000968 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.533 on 354 degrees of freedom
## Multiple R-squared: 0.3906, Adjusted R-squared: 0.382
## F-statistic: 45.38 on 5 and 354 DF, p-value: < 2.2e-16
mod_back3=step(mod_select3, direction="backward", trace=0)
plot(mod_back3)
summary(mod_back3) #time,temp,nonrHC, cpi. adjusted R^2=0.3817
##
## Call:
## lm(formula = asianhc ~ cpi + temp + non_rhc + time, data = new_merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.805 -3.990 -1.220 3.146 40.384
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90.198881 18.120360 4.978 1.01e-06 ***
## cpi -0.577043 0.136810 -4.218 3.13e-05 ***
## temp 0.036042 0.024728 1.458 0.14585
## non_rhc 0.053362 0.007773 6.865 2.97e-11 ***
## time 0.156456 0.049780 3.143 0.00181 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.534 on 355 degrees of freedom
## Multiple R-squared: 0.3885, Adjusted R-squared: 0.3817
## F-statistic: 56.39 on 4 and 355 DF, p-value: < 2.2e-16
vif(mod_back3) #NOT all below 5: cpi (~224) and time (~226) are severely collinear with each other (trending series)
## cpi temp non_rhc time
## 223.807553 1.152376 1.414682 225.650992
#1. fit best lm model
# NOTE(review): this section is headed "asian HC" (mod_select3/mod_back3 above
# model asianhc), yet fit_hc3 regresses blackhc. The ARMA error orders chosen
# from x_t3 below are therefore derived from blackhc residuals even though
# final_mod3 later models asianhc -- confirm this is intentional.
fit_hc3= lm(blackhc~ temp + cpi + non_rhc +time, data=new_merged, na.action=NULL)
summary(fit_hc3) # regression results
##
## Call:
## lm(formula = blackhc ~ temp + cpi + non_rhc + time, data = new_merged,
## na.action = NULL)
##
## Residuals:
## Min 1Q Median 3Q Max
## -215.33 -25.31 -3.21 16.91 469.47
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 145.07366 130.42298 1.112 0.267
## temp 1.02744 0.17798 5.773 1.7e-08 ***
## cpi -0.51098 0.98470 -0.519 0.604
## non_rhc 0.59771 0.05594 10.684 < 2e-16 ***
## time -0.11849 0.35830 -0.331 0.741
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47.03 on 355 degrees of freedom
## Multiple R-squared: 0.4184, Adjusted R-squared: 0.4119
## F-statistic: 63.86 on 4 and 355 DF, p-value: < 2.2e-16
vif(fit_hc3)
## temp cpi non_rhc time
## 1.152376 223.807553 1.414682 225.650992
plot(fit_hc3$residuals)
lines(lowess(fit_hc3$residuals)) #pretty linear. take residuals and fit ARMA
#2. check stationarity for the residuals and fit ARMA
x_t3=fit_hc3$residuals
tsplot(x_t3) #concern with spikes and trend
adf.test(x_t3) #did not pass at the 5% level (p = 0.072)
##
## Augmented Dickey-Fuller Test
##
## data: x_t3
## Dickey-Fuller = -3.2932, Lag order = 7, p-value = 0.07241
## alternative hypothesis: stationary
pp.test(x_t3) #pass
## Warning in pp.test(x_t3): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: x_t3
## Dickey-Fuller Z(alpha) = -157.93, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(x_t3) #pass
## Warning in kpss.test(x_t3): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: x_t3
## KPSS Level = 0.18798, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(x_t3)) #pass 5pct
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -201.84 -13.41 3.73 21.00 446.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.05853 0.02722 -2.150 0.03223 *
## yd.diff.lag1 -0.38172 0.05725 -6.667 1.02e-10 ***
## yd.diff.lag2 -0.27221 0.05952 -4.573 6.68e-06 ***
## yd.diff.lag3 -0.17572 0.05848 -3.005 0.00285 **
## yd.diff.lag4 -0.05995 0.05406 -1.109 0.26823
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.16 on 350 degrees of freedom
## Multiple R-squared: 0.1787, Adjusted R-squared: 0.1669
## F-statistic: 15.23 on 5 and 350 DF, p-value: 1.519e-13
##
##
## Value of test-statistic is: -2.1501
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
acf(x_t3) #decay to 0, might be fixed with AR(1)
pacf(x_t3) #AR(1)
mean(x_t3) #close to 0
## [1] -3.795264e-15
#start with AR(1)
# --- ARMA error-order search on x_t3, same protocol as for x_t2: grow the
# model while AICc/BIC improve, stop when they worsen, confirm via auto.arima,
# then stationarity-test the winner's residuals. ---
error_mod2.1=Arima(x_t3, order=c(1,0,0), include.mean = FALSE)
summary(error_mod2.1) #AICc=3645.18 BIC=3652.92
## Series: x_t3
## ARIMA(1,0,0) with zero mean
##
## Coefficients:
## ar1
## 0.5821
## s.e. 0.0429
##
## sigma^2 = 1448: log likelihood = -1820.57
## AIC=3645.15 AICc=3645.18 BIC=3652.92
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.05611259 38.00186 23.8664 65.26358 174.9703 0.8518535
## ACF1
## Training set -0.07271268
acf(error_mod2.1$residuals, 50) #looks good
pacf(error_mod2.1$residuals, 50) #looks good
#add terms
error_mod2.2=Arima(x_t3, order=c(1,0,1), include.mean = FALSE)
summary(error_mod2.2) # AICc=3639.66 BIC=3651.25
## Series: x_t3
## ARIMA(1,0,1) with zero mean
##
## Coefficients:
## ar1 ma1
## 0.7643 -0.2869
## s.e. 0.0632 0.0989
##
## sigma^2 = 1422: log likelihood = -1816.79
## AIC=3639.59 AICc=3639.66 BIC=3651.25
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.149849 37.6022 23.11698 68.77424 172.2578 0.8251048 0.01373493
acf(error_mod2.2$residuals, 50) #good
pacf(error_mod2.2$residuals, 50) #1/50 looks good
#add terms
error_mod2.3=Arima(x_t3, order=c(2,0,1), include.mean = FALSE)
summary(error_mod2.3) # AICc=3637.11 BIC=3652.54
## Series: x_t3
## ARIMA(2,0,1) with zero mean
##
## Coefficients:
## ar1 ar2 ma1
## 1.3057 -0.3431 -0.8179
## s.e. 0.1101 0.0874 0.0893
##
## sigma^2 = 1407: log likelihood = -1814.5
## AIC=3637 AICc=3637.11 BIC=3652.54
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.7075085 37.3557 22.57958 68.9171 180.8811 0.8059235 -0.007267818
acf(error_mod2.3$residuals, 50) #good
pacf(error_mod2.3$residuals, 50) #good
#add terms
error_mod2.4=Arima(x_t3, order=c(2,0,2), include.mean = FALSE)
summary(error_mod2.4) #AICc=3639.11 BIC=3658.37 got worse
## Series: x_t3
## ARIMA(2,0,2) with zero mean
##
## Coefficients:
## ar1 ar2 ma1 ma2
## 1.3556 -0.3892 -0.8715 0.0352
## s.e. 0.2395 0.2160 0.2475 0.1540
##
## sigma^2 = 1411: log likelihood = -1814.47
## AIC=3638.94 AICc=3639.11 BIC=3658.37
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.7070318 37.3532 22.58601 69.18505 181.2759 0.806153 -0.00365553
acf(error_mod2.4$residuals, 50) #good
pacf(error_mod2.4$residuals, 50) #good
#auto arima
# auto.arima agrees with the manual choice of ARIMA(2,0,1).
error_mod2.5=auto.arima(x_t3)
summary(error_mod2.5) #2,0,1
## Series: x_t3
## ARIMA(2,0,1) with zero mean
##
## Coefficients:
## ar1 ar2 ma1
## 1.3057 -0.3431 -0.8179
## s.e. 0.1101 0.0874 0.0893
##
## sigma^2 = 1407: log likelihood = -1814.5
## AIC=3637 AICc=3637.11 BIC=3652.54
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.7075085 37.3557 22.57958 68.9171 180.8811 0.8059235 -0.007267818
acf(error_mod2.5$residuals, 50) #good
pacf(error_mod2.5$residuals, 50) #good
#check stationarity for the best model
tsplot(error_mod2.3$residuals) #looks stationary, outliers concerned
adf.test(error_mod2.3$residuals)
## Warning in adf.test(error_mod2.3$residuals): p-value smaller than printed p-
## value
##
## Augmented Dickey-Fuller Test
##
## data: error_mod2.3$residuals
## Dickey-Fuller = -6.2479, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(error_mod2.3$residuals)
## Warning in pp.test(error_mod2.3$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: error_mod2.3$residuals
## Dickey-Fuller Z(alpha) = -361.13, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(error_mod2.3$residuals)
## Warning in kpss.test(error_mod2.3$residuals): p-value greater than printed p-
## value
##
## KPSS Test for Level Stationarity
##
## data: error_mod2.3$residuals
## KPSS Level = 0.067833, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(error_mod2.3$residuals)) #pass all
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -191.78 -15.07 4.57 21.83 439.03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.08515 0.03745 -2.274 0.0236 *
## yd.diff.lag1 -0.73747 0.06060 -12.170 < 2e-16 ***
## yd.diff.lag2 -0.54616 0.06848 -7.975 2.19e-14 ***
## yd.diff.lag3 -0.35418 0.06631 -5.341 1.67e-07 ***
## yd.diff.lag4 -0.13696 0.05295 -2.586 0.0101 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41.09 on 350 degrees of freedom
## Multiple R-squared: 0.4092, Adjusted R-squared: 0.4008
## F-statistic: 48.48 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -2.2735
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
#final model
# --- Final model for asianhc: regression with ARIMA(2,0,1) errors on the
# backward-selected regressors (temp, cpi, non_rhc, time). ---
final_mod3=Arima(new_merged$asianhc, order=c(2,0,1), xreg =cbind(new_merged$temp, new_merged$cpi, new_merged$non_rhc, new_merged$time))
summary(final_mod3) #AICc=2277.47 BIC=2311.93
## Series: new_merged$asianhc
## Regression with ARIMA(2,0,1) errors
##
## Coefficients:
## ar1 ar2 ma1 intercept new_merged$temp new_merged$cpi
## 1.1594 -0.1970 -0.8161 59.2817 0.0474 -0.3362
## s.e. 0.1131 0.0859 0.0939 40.0799 0.0259 0.3028
## new_merged$non_rhc new_merged$time
## 0.0367 0.0817
## s.e. 0.0081 0.1065
##
## sigma^2 = 31.75: log likelihood = -1129.48
## AIC=2276.95 AICc=2277.47 BIC=2311.93
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.05253393 5.571598 4.13377 -10.44372 27.63248 0.7991509
## ACF1
## Training set -0.003537299
acf(final_mod3$residuals, lag=50) #looks okay
pacf(final_mod3$residuals, lag=50) #looks good
adf.test(final_mod3$residuals)
## Warning in adf.test(final_mod3$residuals): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: final_mod3$residuals
## Dickey-Fuller = -6.8737, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
pp.test(final_mod3$residuals)
## Warning in pp.test(final_mod3$residuals): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: final_mod3$residuals
## Dickey-Fuller Z(alpha) = -366.9, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(final_mod3$residuals)
## Warning in kpss.test(final_mod3$residuals): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: final_mod3$residuals
## KPSS Level = 0.15042, Truncation lag parameter = 5, p-value = 0.1
summary(ur.ers(final_mod3$residuals)) #passed all
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.989 -2.942 0.560 4.069 37.646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.10442 0.04064 -2.569 0.0106 *
## yd.diff.lag1 -0.73078 0.06167 -11.850 < 2e-16 ***
## yd.diff.lag2 -0.54121 0.06941 -7.797 7.32e-14 ***
## yd.diff.lag3 -0.31351 0.06675 -4.697 3.80e-06 ***
## yd.diff.lag4 -0.11770 0.05278 -2.230 0.0264 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.08 on 350 degrees of freedom
## Multiple R-squared: 0.4164, Adjusted R-squared: 0.4081
## F-statistic: 49.94 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -2.5691
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
qqPlot(final_mod3$residuals) #concern with outliers on the right side but pretty normal
## [1] 351 352
shapiro.test(final_mod3$residuals) #very small, 1.4e-11. Did not pass normality
##
## Shapiro-Wilk normality test
##
## data: final_mod3$residuals
## W = 0.93361, p-value = 1.397e-11
#get R^2 of the model
# In-sample fit assessment for final_mod3: squared correlation between fitted
# and observed asianhc, plus RMSE and range-normalized RMSE.
# Vectorized: replaces an O(n^2) grow-in-loop (`append()` one element at a
# time, then `rev()`) with a single coercion that yields the same numeric
# vector of fitted values.
prediction=as.numeric(final_mod3$fitted)
cv_df=data.frame("predict"=prediction, "real"=new_merged$asianhc)
cor(cv_df$real,cv_df$predict)^2 # R^2 = 0.5492432
## [1] 0.5492432
RMSE=sqrt(mean((cv_df$real - cv_df$predict)^2))
N_RMSE=RMSE/(max(cv_df$real)-min(cv_df$real))
round(RMSE , digits = 3)
## [1] 5.572
round(N_RMSE, digits = 3) # 0.116
## [1] 0.116
# --- Fitted-vs-actual plot, holdout validation (train 1991-2019, test 2020),
# and mean-regressor forecasts for the asianhc model. ---
tsplot(new_merged$asianhc, lwd=2)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a graphical
## parameter
## Warning in rep(y, length.out = n): 'x' is NULL so the result will be NULL
lines(final_mod3$fitted, col="red", pch=50)
#cross-validation
# NOTE(review): `a` and `t` below are assigned but never used afterwards.
a=new_merged$asianhc[1:336,] #including only through 2019
t=1:336
sarimaMod3=Arima(new_merged$asianhc[1:336,],order=c(2,0,1), xreg =cbind(new_merged$temp[1:336,], new_merged$cpi[1:336,], new_merged$non_rhc[1:336,], new_merged$time[1:336,]))
summary(sarimaMod3) # AICc=2055.28 BIC=2089.09
## Series: new_merged$asianhc[1:336, ]
## Regression with ARIMA(2,0,1) errors
##
## Coefficients:
## ar1 ar2 ma1 intercept new_merged$temp[1:336, ]
## 0.9966 -0.0548 -0.8024 36.1527 0.0303
## s.e. 0.1002 0.0731 0.0821 30.2853 0.0214
## new_merged$cpi[1:336, ] new_merged$non_rhc[1:336, ]
## -0.1537 0.0465
## s.e. 0.2306 0.0073
## new_merged$time[1:336, ]
## -0.0022
## s.e. 0.0827
##
## sigma^2 = 25.71: log likelihood = -1018.37
## AIC=2054.73 AICc=2055.28 BIC=2089.09
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.06622725 5.009553 3.920922 -9.415356 26.85912 0.7596929
## ACF1
## Training set -0.002104686
# Forecast 12 months ahead with regressors held at training means; the
# column-name warning is benign as long as column ORDER matches training.
fcast4 <- forecast(sarimaMod3, xreg=cbind(rep(mean(new_merged$temp[1:336,]), 12), rep(mean(new_merged$cpi[1:336,]), 12), rep(mean(new_merged$non_rhc[1:336,]),12), rep(mean(new_merged$time[1:336,]),12)))
## Warning in forecast.forecast_ARIMA(sarimaMod3, xreg =
## cbind(rep(mean(new_merged$temp[1:336, : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
autoplot(fcast4)
#real data
real_value=new_merged$asianhc[c(337:348)]
#compare with real data
prediction2={}
for (i in 1:12){
print(fcast4$mean[i])
prediction2=append(fcast4$mean[i], prediction2)
}
## [1] 20.289
## [1] 19.80259
## [1] 19.64663
## [1] 19.51783
## [1] 19.39801
## [1] 19.28565
## [1] 19.18023
## [1] 19.08132
## [1] 18.98852
## [1] 18.90145
## [1] 18.81975
## [1] 18.74311
prediction2=rev(prediction2)
cv_df1=data.frame("predict"=prediction2, "real"=real_value)
cor(cv_df1$real,cv_df1$predict)^2 # R^2 = 0.3731521
## [1] 0.3731521
RMSE1=sqrt(mean((cv_df1$real - cv_df1$predict)^2))
N_RMSE1=RMSE1/(max(cv_df1$real)-min(cv_df1$real))
round(RMSE1 , digits = 3)
## [1] 4.737
round(N_RMSE1, digits = 3) #0.526
## [1] 0.526
#make prediction
# 12- and 24-month forecasts from the full-sample model, regressors at means.
xreg4_pred=forecast(final_mod3, xreg=cbind(rep(mean(new_merged$temp), 12), rep(mean(new_merged$cpi), 12), rep(mean(new_merged$non_rhc),12), rep(mean(new_merged$time),12)))
## Warning in forecast.forecast_ARIMA(final_mod3, xreg =
## cbind(rep(mean(new_merged$temp), : xreg contains different column names from the
## xreg used in training. Please check that the regressors are in the same order.
autoplot(xreg4_pred)+xlab("Time")+ylab("anti-Asian hate crimes")
xreg4_pred=forecast(final_mod3, xreg=cbind(rep(mean(new_merged$temp), 24), rep(mean(new_merged$cpi), 24), rep(mean(new_merged$non_rhc),24), rep(mean(new_merged$time),24)))
## Warning in forecast.forecast_ARIMA(final_mod3, xreg =
## cbind(rep(mean(new_merged$temp), : xreg contains different column names from the
## xreg used in training. Please check that the regressors are in the same order.
autoplot(xreg4_pred)+xlab("Time")+ylab("anti-Asian hate crimes")
# --- Volatility exploration: log-returns of each hate-crime series, and an
# (abandoned) GARCH attempt on the anti-Asian series. ---
hcR=diff(log(allHC$hate_crime))
tsplot(hcR, col="lightcoral")
blackhcR=diff(log(blackHC$hate_crime))
tsplot(blackhcR, col="lightcoral")
#GARCH for asian HC
asianhcR=diff(log(asianHC$hate_crime))
tsplot(asianhcR,col="lightcoral")
acf(asianhcR) #lag 1 sig, MA(1)
pacf(asianhcR) #lag 12 sig.several lags at front
ahcMod1=Arima(asianhcR, order=c(0,0,1))
acf(ahcMod1$residuals , lag=38) #looks good, almost no correlation
pacf(ahcMod1$residuals, lag=38) #lag 12, 22, 24 sig. Try SARIMA? Try simple way and see if GRACH fix it
tsplot(ahcMod1$residuals) #slight heteroskedasticity
# Squared residuals probe for ARCH effects; near-zero autocorrelation here is
# the reason GARCH is ruled out below.
res1 =ahcMod1$residuals
tsplot(res1^2)
acf(res1^2, lag=50) #lag 32 sig. 1/50, almost no autocorrelation. CAN'T USE GARCH!
pacf(res1^2, lag=50) #lag 31 sig. 1/50
auto.arima(res1^2) #suggest ARIMA(0,0,1)
## Series: res1^2
## ARIMA(0,0,1) with non-zero mean
##
## Coefficients:
## ma1 mean
## 0.0815 0.1102
## s.e. 0.0521 0.0110
##
## sigma^2 = 0.03738: log likelihood = 81.53
## AIC=-157.06 AICc=-156.99 BIC=-145.41
# Abandoned garchFit experiments kept for the record (fGarch package).
#try garch(1,0)
#ahcFit1=garchFit(~arma(0,1) + garch(1,0), data = asianhcR)
#summary(ahcFit1) #0.6664991 0.7097673
#plot(ahcFit1,which="all") #no significant autocorrelation remains at plot 10 and 11. Really small lag at lag 12. Residuals looks pretty normal but have outliers
#try garch(1,0)
#ahcFit2=garchFit(~arma(0,1) + garch(0,1), data = asianhcR) #Error, does not allow me to try garch(0,1)
#summary(ahcFit2) #0.6664991 0.7097673
#plot(ahcFit2,which="all") #no significant autocorrelation remains at plot 10 and 11. Really small lag at lag 12. Residuals looks pretty normal but have outliers
#try garch(1,1)
#ahcFit2=garchFit(~arma(0,1) + garch(1,1), data = asianhcR) #Error, does not allow me to try garch(0,1)
#summary(ahcFit2) #0.6720701 0.7261554 got worse
#plot(ahcFit2,which="all") #no significant autocorrelation remains at plot 10 and 11. Really small lag at lag 12. Residuals looks pretty normal but have outliers
#what if I fit SARIMA? Can I do this?
#plot(ahcFit1, which=3)
#ahcFit1$
# Slowly-decaying ACFs below motivate the long-memory (ARFIMA) models next.
tsplot(allHC$hate_crime)
acf(allHC$hate_crime, 50, main="all racial hate crime")
tsplot(blackHC$hate_crime)
acf(blackHC$hate_crime,50, main="anti-Black or African American hate crime")
tsplot(asianHC$hate_crime)
acf(asianHC$hate_crime,50, main="anti-Asian hate crime")
library(arfima)
## Loading required package: ltsa
## Note that the arfima package has new defaults starting with
## 1.4-0: type arfimachanges() for a list, as well as some other notes.
## NOTE: some of these are quite important!
##
## Attaching package: 'arfima'
## The following object is masked from 'package:forecast':
##
## arfima
## The following object is masked from 'package:stats':
##
## BIC
summary(arfimaMod1 <- arfima(allHC$hate_crime, order = c(0,0,0))) #d=0.4757504
## Note: only one starting point. Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
##
## Call:
##
## arfima(z = allHC$hate_crime, order = c(0, 0, 0))
##
##
## Mode 1 Coefficients:
## Estimate Std. Error Th. Std. Err. z-value Pr(>|z|)
## d.f 0.4757504 0.0275354 0.0410974 17.2778 < 2e-16 ***
## Fitted mean 389.0534119 189.9210362 NA 2.0485 0.040511 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma^2 estimated as 7050.6; Log-likelihood = -1595.65; AIC = 3197.3; BIC = 3208.96
##
## Numerical Correlations of Coefficients:
## d.f Fitted mean
## d.f 1.00 0.02
## Fitted mean 0.02 1.00
##
## Theoretical Correlations of Coefficients:
## d.f
## d.f 1.00
##
## Expected Fisher Information Matrix of Coefficients:
## d.f
## d.f 1.64
summary(arfimaMod2 <- arfima(blackHC$hate_crime, order = c(0,0,0))) #d=0.4911473
## Note: only one starting point. Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
##
## Call:
##
## arfima(z = blackHC$hate_crime, order = c(0, 0, 0))
##
##
## Mode 1 Coefficients:
## Estimate Std. Error Th. Std. Err. z-value Pr(>|z|)
## d.f 0.4911473 0.0118870 0.0410974 41.3181 < 2e-16 ***
## Fitted mean 207.6410160 166.7932852 NA 1.2449 0.21317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma^2 estimated as 1702.71; Log-likelihood = -1340.42; AIC = 2686.85; BIC = 2698.5
##
## Numerical Correlations of Coefficients:
## d.f Fitted mean
## d.f 1.00 0.03
## Fitted mean 0.03 1.00
##
## Theoretical Correlations of Coefficients:
## d.f
## d.f 1.00
##
## Expected Fisher Information Matrix of Coefficients:
## d.f
## d.f 1.64
summary(arfimaMod3 <- arfima(asianHC$hate_crime, order = c(0,0,0))) #d=0.3988466
## Note: only one starting point. Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
##
## Call:
##
## arfima(z = asianHC$hate_crime, order = c(0, 0, 0))
##
##
## Mode 1 Coefficients:
## Estimate Std. Error Th. Std. Err. z-value Pr(>|z|)
## d.f 0.3988466 0.0352278 0.0410974 11.32192 < 2.22e-16 ***
## Fitted mean 18.1603855 4.4192760 NA 4.10936 3.9676e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma^2 estimated as 34.2928; Log-likelihood = -636.143; AIC = 1278.29; BIC = 1289.95
##
## Numerical Correlations of Coefficients:
## d.f Fitted mean
## d.f 1.00 0.00
## Fitted mean 0.00 1.00
##
## Theoretical Correlations of Coefficients:
## d.f
## d.f 1.00
##
## Expected Fisher Information Matrix of Coefficients:
## d.f
## d.f 1.64
# Residual diagnostics for the three ARFIMA fits.  resid() on an arfima
# object returns a list (one element per fitted mode); [[1]] extracts
# the mode-1 residual series.
r = resid(arfimaMod1)[[1]]
acf(r, 50) # seasonal component remaining
r2 = resid(arfimaMod2)[[1]]
acf(r2, 50) # seasonal component remaining
#Seems like asianHC could use ARFIMA
r3 = resid(arfimaMod3)[[1]]
tsplot(r3)
acf(r3, 50) # looks good but really small lags in 12 period
pacf(r3, 50) #looks good but 12, 23 concerning but small
# Stationarity checks on the Asian-series ARFIMA residuals: ADF and PP
# reject a unit root (p = 0.01), but KPSS rejects level stationarity at
# the 5% level (p = 0.033) -- hence the "#fail" note below.
adf.test(r3)
## Warning in adf.test(r3): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: r3
## Dickey-Fuller = -6.5764, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(r3) #fail
##
## KPSS Test for Level Stationarity
##
## data: r3
## KPSS Level = 0.53845, Truncation lag parameter = 5, p-value = 0.03301
pp.test(r3)
## Warning in pp.test(r3): p-value smaller than printed p-value
##
## Phillips-Perron Unit Root Test
##
## data: r3
## Dickey-Fuller Z(alpha) = -369.42, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# DF-GLS (ERS) test: statistic -3.46 is below the 1% critical value
# -2.57, so the unit-root null is rejected here as well.
summary(ur.ers(r3))
##
## ###############################################
## # Elliot, Rothenberg and Stock Unit Root Test #
## ###############################################
##
## Test of type DF-GLS
## detrending of series with intercept
##
##
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.344 -2.930 0.855 4.632 37.265
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## yd.lag -0.19313 0.05577 -3.463 0.0006 ***
## yd.diff.lag1 -0.67702 0.06840 -9.899 < 2e-16 ***
## yd.diff.lag2 -0.52171 0.07279 -7.167 4.57e-12 ***
## yd.diff.lag3 -0.30353 0.06808 -4.459 1.11e-05 ***
## yd.diff.lag4 -0.12892 0.05273 -2.445 0.0150 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.337 on 350 degrees of freedom
## Multiple R-squared: 0.4369, Adjusted R-squared: 0.4288
## F-statistic: 54.3 on 5 and 350 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -3.4631
##
## Critical values of DF-GLS are:
## 1pct 5pct 10pct
## critical values -2.57 -1.94 -1.62
# Normality check on the residuals; qqPlot flags observations 351 and
# 352 (late 2020) as the most extreme points.
qqPlot(r3)
## [1] 351 352
#compare with ARIMA way
# Alternative to fractional differencing: first-difference the Asian
# series and inspect ACF/PACF; auto.arima picks an over-parameterized
# (2,0,2), suggesting integer differencing over-differences here.
daHC=diff(asianHC$hate_crime)
acf(daHC) #MA(1)
pacf(daHC) #a lot of lags, tailing off like an MA(2)?
auto.arima(daHC) #(2,0,2) too complicated, overdifferenced!
## Series: daHC
## ARIMA(2,0,2) with zero mean
##
## Coefficients:
## ar1 ar2 ma1 ma2
## 0.5687 -0.0288 -1.2067 0.2743
## s.e. 0.5477 0.1608 0.5446 0.4839
##
## sigma^2 = 34.17: log likelihood = -1141.78
## AIC=2293.57 AICc=2293.74 BIC=2312.99
#prediction on 2021 and 2022
# 24-step-ahead forecast (Jan 2021 - Dec 2022) from the Asian-series
# ARFIMA fit; predict() reports point forecasts plus exact and limiting
# standard deviations for each horizon.
fcast_arfima=predict(arfimaMod3, n.ahead=24)
fcast_arfima
## $`Mode 1`
## $`Mode 1`$`Forecasts and SDs`
## 1 2 3 4 5 6 7
## Forecasts 18.17083 18.59330 18.71884 18.73874 18.71289 18.66593 18.60949
## Exact SD 5.85730 6.30696 6.51608 6.64662 6.73950 6.81066 6.86786
## Limiting SD 5.85601 6.30461 6.51282 6.64252 6.73462 6.80504 6.86153
## 8 9 10 11 12 13 14
## Forecasts 18.54950 18.48913 18.43007 18.37323 18.31908 18.26780 18.21943
## Exact SD 6.91540 6.95588 6.99102 7.02197 7.04957 7.07444 7.09702
## Limiting SD 6.90838 6.94821 6.98270 7.01303 7.04003 7.06430 7.08631
## 15 16 17 18 19 20 21
## Forecasts 18.17392 18.13115 18.09101 18.05335 18.01802 17.98487 17.95376
## Exact SD 7.11769 7.13672 7.15433 7.17071 7.18601 7.20035 7.21384
## Limiting SD 7.10640 7.12487 7.14194 7.15778 7.17255 7.18637 7.19935
## 22 23 24
## Forecasts 17.92456 17.89714 17.87139
## Exact SD 7.22657 7.23862 7.25005
## Limiting SD 7.21158 7.22312 7.23406
# Point forecasts and exact SDs transcribed by hand from the predict()
# output above.  NOTE(review): these constants must be kept in sync with
# fcast_arfima; extracting them programmatically from the predict object
# would be more robust -- confirm the arfima predict() list structure.
final_predict=data.frame("fit"=c( 18.17083, 18.59330, 18.71884, 18.73874 ,18.71289 ,18.66593 ,18.60949, 18.54950, 18.48913, 18.43007, 18.37323 ,18.31908, 18.26780 ,18.21943 ,18.17392, 18.13115, 18.09101 ,18.05335 ,18.01802, 17.98487, 17.95376 ,17.92456, 17.89714, 17.87139), "sd"=c( 5.85730, 6.30696, 6.51608 , 6.64662 , 6.73950 , 6.81066 ,6.86786 , 6.91540, 6.95588 ,6.99102 , 7.02197 ,7.04957 , 7.07444 ,7.09702 ,7.11769 , 7.13672, 7.15433 , 7.17071 ,7.18601 ,7.20035 , 7.21384, 7.22657 , 7.23862 ,7.25005))
# Monthly ts of point forecasts starting Jan 2021.
fitted=ts(final_predict$fit, start=c(2021,1), frequency=12)
# 95% normal forecast interval (fit +/- 1.96*sd), computed vectorized --
# replaces the original element-by-element loop; identical values.
fit_df=data.frame("lwr"=final_predict$fit - 1.96*final_predict$sd,
                  "upr"=final_predict$fit + 1.96*final_predict$sd)
lower_list=ts(fit_df$lwr,start=c(2021,1),frequency =12)
upper_list=ts(fit_df$upr,start=c(2021,1),frequency =12)
#NA_list=ts(rep(NA, 12),start=c(2017,7),frequency =12)
#merged_ts <- ts(c(salmon, NA_list),
# start = start(salmon),
# frequency = frequency(salmon))
# Plot the observed Asian hate-crime series padded with 24 NA months so
# the axis extends through 2022, then overlay the ARFIMA point forecast
# (red) and its 95% interval (blue).
asianHC_ts=ts(asianHC$hate_crime, start=c(1991,1), frequency=12)
NA_list=ts(rep(NA, 24),start=c(2021,1),frequency =12)
merged_ts <- ts(c(asianHC_ts, NA_list),
                start = start(asianHC_ts),
                frequency = frequency(asianHC_ts))
tsplot(merged_ts)
# BUG FIX: lines() has no start/end arguments -- they were passed down
# to the plotting internals and produced the "not a graphical parameter"
# warnings recorded in the original run.  `fitted` is already a ts
# spanning 2021-01 .. 2022-12, so a plain lines() call draws the same
# red segment without warnings.
lines(fitted, col="red")
abline(h=0, col=8) # horizontal reference line at y = 0
lines(lower_list, col="blue")
lines(upper_list, col="blue")
#ARFIMA on seasonal diff. data??
# ACFs of the lag-12 (seasonally) differenced series, to check whether
# fractional differencing is still warranted once seasonality is removed.
acf(diff(allHC$hate_crime, lag=12))
acf(diff(blackHC$hate_crime, lag=12))
acf(diff(asianHC$hate_crime, lag=12))
# ARFIMA(0,d,0) refit on each seasonally differenced series.  The
# memory parameter d drops for all three relative to the raw-series
# fits (all-race 0.388, Black 0.475, Asian 0.242), so part of the
# apparent long memory was seasonal.
summary(arfimaMod1.2 <- arfima(diff(allHC$hate_crime, lag=12), order = c(0,0,0))) #d=0.3878510
## Note: only one starting point. Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
##
## Call:
##
## arfima(z = diff(allHC$hate_crime, lag = 12), order = c(0, 0, 0))
##
##
## Mode 1 Coefficients:
## Estimate Std. Error Th. Std. Err. z-value Pr(>|z|)
## d.f 0.3878510 0.0445982 0.0417971 8.69656 < 2e-16 ***
## Fitted mean 9.1924449 64.1638528 NA 0.14327 0.88608
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma^2 estimated as 8517.79; Log-likelihood = -1574.47; AIC = 3154.94; BIC = 3166.5
##
## Numerical Correlations of Coefficients:
## d.f Fitted mean
## d.f 1.00 -0.20
## Fitted mean -0.20 1.00
##
## Theoretical Correlations of Coefficients:
## d.f
## d.f 1.00
##
## Expected Fisher Information Matrix of Coefficients:
## d.f
## d.f 1.64
summary(arfimaMod2.2 <- arfima(diff(blackHC$hate_crime, lag=12), order = c(0,0,0))) #d=0.4749521
## Note: only one starting point. Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
##
## Call:
##
## arfima(z = diff(blackHC$hate_crime, lag = 12), order = c(0, 0, 0))
##
##
## Mode 1 Coefficients:
## Estimate Std. Error Th. Std. Err. z-value Pr(>|z|)
## d.f 0.4749521 0.0284903 0.0417971 16.67063 < 2e-16 ***
## Fitted mean 5.8687671 91.1756546 NA 0.06437 0.94868
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma^2 estimated as 1617.35; Log-likelihood = -1286.28; AIC = 2578.56; BIC = 2590.11
##
## Numerical Correlations of Coefficients:
## d.f Fitted mean
## d.f 1.00 -0.21
## Fitted mean -0.21 1.00
##
## Theoretical Correlations of Coefficients:
## d.f
## d.f 1.00
##
## Expected Fisher Information Matrix of Coefficients:
## d.f
## d.f 1.64
summary(arfimaMod3.2 <- arfima(diff(asianHC$hate_crime, lag=12), order = c(0,0,0))) #d=0.2416897
## Note: only one starting point. Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
##
## Call:
##
## arfima(z = diff(asianHC$hate_crime, lag = 12), order = c(0, 0, 0))
##
##
## Mode 1 Coefficients:
## Estimate Std. Error Th. Std. Err. z-value Pr(>|z|)
## d.f 0.2416897 0.0435546 0.0417971 5.54912 2.871e-08 ***
## Fitted mean 0.4908735 1.6364596 NA 0.29996 0.76421
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma^2 estimated as 53.5331; Log-likelihood = -691.821; AIC = 1389.64; BIC = 1401.2
##
## Numerical Correlations of Coefficients:
## d.f Fitted mean
## d.f 1.00 0.04
## Fitted mean 0.04 1.00
##
## Theoretical Correlations of Coefficients:
## d.f
## d.f 1.00
##
## Expected Fisher Information Matrix of Coefficients:
## d.f
## d.f 1.64
# Residual ACF/PACF for the seasonally differenced ARFIMA fits; all
# three still show a strong spike at lag 12, i.e. fractional
# differencing alone does not capture the seasonal structure.
r.2 = resid(arfimaMod1.2)[[1]]
acf(r.2, 50) #huge lag at 12.
pacf(r.2, 50) #12, 24
r2.2 = resid(arfimaMod2.2)[[1]]
acf(r2.2, 50) #huge lag at 12
pacf(r2.2, 50) #12, 1
r3.2 = resid(arfimaMod3.2)[[1]]
acf(r3.2, 50) #huge lag at 12
pacf(r3.2,50) #some seasonal lags
#trial cases: can I fit SARIMA to residual?
# Trial: add an AR(1) term on top of fractional differencing for the
# seasonally differenced all-race series.
trail_allHC <- arfima(diff(allHC$hate_crime, lag=12), order = c(1,0,0))
## Note: only one starting point. Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
trail_allHC
## Number of modes: 1
##
## Call:
## arfima(z = diff(allHC$hate_crime, lag = 12), order = c(1, 0, 0))
##
## Coefficients for fits:
## Coef.1: SE.1:
## phi(1) 0.139969 0.117196
## d.f 0.291948 0.0967529
## Fitted mean 9.22225 34.9748
## logl -1573.55
## sigma^2 8514.31
## Starred fits are close to invertibility/stationarity boundaries
tsplot(resid(trail_allHC)[[1]])
acf(resid(trail_allHC)[[1]]) #still, lag 12 concerned
#compare
# NOTE(review): this comparison fits auto.arima to the *Asian* series
# while the surrounding trial concerns allHC -- possibly a copy-paste
# slip; confirm which series was intended.
compare1=auto.arima(diff(asianHC$hate_crime, lag=12)) #(1,1)
acf(compare1$residuals) #still, lag 12
pacf(compare1$residuals) #still, lag 12
# Same AR(1)+fractional-d trial for the seasonally differenced Black
# series.
trail_blackHC <- arfima(diff(blackHC$hate_crime, lag=12), order = c(1,0,0))
## Note: only one starting point. Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
# BUG FIX: the original lines printed and diagnosed trail_allHC again
# (a copy-paste slip), so the Black-series fit was never actually
# inspected; the transcript that followed was trail_allHC's output and
# has been dropped as stale.  Re-run to record trail_blackHC's
# coefficients and residual diagnostics.
trail_blackHC
tsplot(resid(trail_blackHC)[[1]])
acf(resid(trail_blackHC)[[1]]) #still, lag 12 concerned
# AR(1)+fractional-d trial for the seasonally differenced Asian series;
# note phi(1) is essentially zero, so the AR term adds nothing here.
trail_asianHC <- arfima(diff(asianHC$hate_crime, lag=12), order = c(1,0,0))
## Note: only one starting point. Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
trail_asianHC
## Number of modes: 1
##
## Call:
## arfima(z = diff(asianHC$hate_crime, lag = 12), order = c(1, 0, 0))
##
## Coefficients for fits:
## Coef.1: SE.1:
## phi(1) 0.00604986 0.0973575
## d.f 0.237606 0.078903
## Fitted mean 0.482695 1.60599
## logl -691.819
## sigma^2 53.69
## Starred fits are close to invertibility/stationarity boundaries
tsplot(resid(trail_asianHC)[[1]])
acf(resid(trail_asianHC)[[1]]) #still, lag 12 concerned
# Benchmark: auto.arima on the same seasonally differenced series picks
# ARMA(1,1); its residuals also retain the lag-12 spike.
autoMod = auto.arima(diff(asianHC$hate_crime, lag=12))
summary(autoMod)
## Series: diff(asianHC$hate_crime, lag = 12)
## ARIMA(1,0,1) with zero mean
##
## Coefficients:
## ar1 ma1
## 0.7439 -0.5025
## s.e. 0.0860 0.1099
##
## sigma^2 = 53.04: log likelihood = -1183.82
## AIC=2373.64 AICc=2373.71 BIC=2385.2
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.07539184 7.261722 5.429604 NaN Inf 0.7702667 -0.006083782
acf(autoMod$residuals)
pacf(autoMod$residuals)
#In general, ARFIMA on seasonally differenced data have concern remaining with significant lag at 12. This could be fixed with using SARIMA. However, no info given on those.
#CCF with non racial hate crimes and racial hate crimes?
#SARIMA to non racial hate crimes?
#boot strap forecast interval of SARIMA model.
mod9HC=Arima(allHC$hate_crime,order=c(2,0,1), seasonal=list(order=c(0,1,1),period=12), include.drift = FALSE)
#summary(mod9HC) #AIC=4017.1 AICc=4017.27 BIC=4036.36
#compare forecasting performance of two models
a=allHC[1:312,] #including only through 2016
t=1:312
sarimaMod_allHC=Arima(a$hate_crime,order=c(2,0,1), seasonal=list(order=c(0,1,1),period=12), include.drift = FALSE)
#summary(sarimaMod) #AIC=3780.98 AICc=3781.16 BIC=3800.06
sarimaPred=forecast(sarimaMod_allHC,h=48)$mean
#real data
real_value=allHC$hate_crime[c(313:360)]
#compare with real data
cor(real_value,sarimaPred)^2 # R^2 = 0.1606327
## [1] 0.1606327
RMSE1=sqrt(mean((real_value - sarimaPred)^2))
N_RMSE1=RMSE1/(max(real_value)-min(real_value))
round(RMSE1 , digits = 3)
## [1] 149.781
round(N_RMSE1, digits = 3) #0.172
## [1] 0.172
#cross-validation
# NOTE(review): `a` is assigned here but appears unused below (the
# model call indexes new_merged directly); also 312 months from 1991
# ends in 2016, not 2018 -- verify the intended cutoff.
a=new_merged$hc[1:312,] #including only through 2018
# ARIMA(1,0,1) regression with external regressors: temperature,
# unemployment, CPI, and non-racial hate crimes.
sarimaMod=Arima(new_merged$hc[1:312,],order=c(1,0,1), xreg =cbind(new_merged$temp[1:312,], new_merged$unemp[1:312,], new_merged$cpi[1:312,], new_merged$non_rhc[1:312,]))
summary(sarimaMod) #AICc=3343.96 BIC=3373.43 (per output below; earlier note said 3592.9/3622.99)
## Series: new_merged$hc[1:312, ]
## Regression with ARIMA(1,0,1) errors
##
## Coefficients:
## ar1 ma1 intercept new_merged$temp[1:312, ]
## 0.928 -0.7376 354.5704 1.4847
## s.e. 0.042 0.0674 68.9848 0.2074
## new_merged$unemp[1:312, ] new_merged$cpi[1:312, ]
## 0.4911 -1.7716
## s.e. 5.1580 0.3061
## new_merged$non_rhc[1:312, ]
## 1.3545
## s.e. 0.0751
##
## sigma^2 = 2562: log likelihood = -1663.74
## AIC=3343.49 AICc=3343.96 BIC=3373.43
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.9564581 50.0412 36.81672 -0.8859321 9.639288 0.7367609
## ACF1
## Training set 0.02037902
# Forecast 48 months holding every regressor at its training-sample
# mean.  The warning below is about mismatched xreg column names only;
# it is harmless as long as the columns are supplied in the same order
# as in training (they are).
fcast2 <- forecast(sarimaMod, xreg=cbind(rep(mean(new_merged$temp[1:312,]), 48), rep(mean(new_merged$unemp[1:312,]), 48), rep(mean(new_merged$cpi[1:312,]),48), rep(mean(new_merged$non_rhc[1:312,]),48)))
## Warning in forecast.forecast_ARIMA(sarimaMod, xreg =
## cbind(rep(mean(new_merged$temp[1:312, : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
autoplot(fcast2)
#real data
real_value=new_merged$hc[c(313:360)]
#compare with real data
# Collect the 48 forecast means.  Preallocate and fill in order instead
# of the original append()-then-rev() idiom (which grew the vector one
# element at a time and then had to be reversed); the printed output is
# unchanged.
prediction2=numeric(48)
for (i in 1:48){
print(fcast2$mean[i])
prediction2[i]=fcast2$mean[i]
}
## [1] 377.3941
## [1] 377.9822
## [1] 378.528
## [1] 379.0344
## [1] 379.5044
## [1] 379.9406
## [1] 380.3453
## [1] 380.721
## [1] 381.0695
## [1] 381.393
## [1] 381.6932
## [1] 381.9717
## [1] 382.2303
## [1] 382.4702
## [1] 382.6928
## [1] 382.8994
## [1] 383.0911
## [1] 383.269
## [1] 383.4341
## [1] 383.5873
## [1] 383.7295
## [1] 383.8615
## [1] 383.9839
## [1] 384.0975
## [1] 384.203
## [1] 384.3009
## [1] 384.3917
## [1] 384.4759
## [1] 384.5541
## [1] 384.6267
## [1] 384.6941
## [1] 384.7566
## [1] 384.8146
## [1] 384.8684
## [1] 384.9183
## [1] 384.9647
## [1] 385.0077
## [1] 385.0476
## [1] 385.0847
## [1] 385.119
## [1] 385.1509
## [1] 385.1805
## [1] 385.208
## [1] 385.2335
## [1] 385.2572
## [1] 385.2791
## [1] 385.2995
## [1] 385.3184
# Out-of-sample metrics for the xreg model: R^2, RMSE, and range-
# normalized RMSE against the 2017-2020 holdout.
cv_df1=data.frame("predict"=prediction2, "real"=real_value)
cor(cv_df1$real,cv_df1$predict)^2 # R^2 = 0.1112606
## [1] 0.1112606
RMSE1=sqrt(mean((cv_df1$real - cv_df1$predict)^2))
N_RMSE1=RMSE1/(max(cv_df1$real)-min(cv_df1$real))
round(RMSE1 , digits = 3)
## [1] 149.678
round(N_RMSE1, digits = 3) #0.172
## [1] 0.172
#CV
#compare forecasting performance of two models
# Holdout evaluation for the Black series: SARIMA(0,1,2)x(0,1,1)[12]
# trained on months 1-312, forecasting the last 48 months.
b=blackHC[1:312,] #including only through 2019
t=1:312
sarimaModB=Arima(b$hate_crime,order=c(0,1,2), seasonal=list(order=c(0,1,1),period=12), include.drift = FALSE)
#summary(sarimaModB) #AIC=3072.28 AICc=3072.4 BIC=3087.53
sarimaBPred=forecast(sarimaModB,h=48)$mean
#real data
real_value2=blackHC$hate_crime[c(313:360)]
#compare with real data
cor(real_value2,sarimaBPred)^2 # R^2 = 0.1172818
## [1] 0.1172818
#cross-validation
# Regression-with-ARMA(2,1)-errors model for the Black series with the
# same four external regressors as the all-race model.
a=new_merged$blackhc[1:312,] #including only through 2019
t=1:312
sarimaMod2=Arima(new_merged$blackhc[1:312,],order=c(2,0,1), xreg =cbind(new_merged$temp[1:312,], new_merged$unemp[1:312,], new_merged$cpi[1:312,], new_merged$non_rhc[1:312,]))
summary(sarimaMod2) #AICc=2939.3 BIC=2972.39 (per output below; earlier note said 3718.89/3749.28)
## Series: new_merged$blackhc[1:312, ]
## Regression with ARIMA(2,0,1) errors
##
## Coefficients:
## ar1 ar2 ma1 intercept new_merged$temp[1:312, ]
## 1.1880 -0.1953 -0.7596 -63.0393 1.1456
## s.e. 0.0836 0.0813 0.0549 267.0067 0.1282
## new_merged$unemp[1:312, ] new_merged$cpi[1:312, ]
## 3.1408 0.6119
## s.e. 3.6133 1.2973
## new_merged$non_rhc[1:312, ]
## 0.2551
## s.e. 0.0393
##
## sigma^2 = 693.8: log likelihood = -1460.35
## AIC=2938.71 AICc=2939.3 BIC=2972.39
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.09864195 26.00008 20.1361 -1.235001 9.849989 0.745604
## ACF1
## Training set 0.002420022
# Forecast with regressors held at training means; the column-name
# warning is benign as long as regressor order matches training.
fcast3 <- forecast(sarimaMod2, xreg=cbind(rep(mean(new_merged$temp[1:312,]), 48), rep(mean(new_merged$unemp[1:312,]), 48), rep(mean(new_merged$cpi[1:312,]),48), rep(mean(new_merged$non_rhc[1:312,]),48)))
## Warning in forecast.forecast_ARIMA(sarimaMod2, xreg =
## cbind(rep(mean(new_merged$temp[1:312, : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
autoplot(fcast3)
#real data
real_value=new_merged$blackhc[c(313:360)]
#compare with real data
# Collect the 48 forecast means for the Black series.  Preallocate and
# fill in order instead of the original append()-then-rev() idiom;
# the printed output is unchanged.
prediction2=numeric(48)
for (i in 1:48){
print(fcast3$mean[i])
prediction2[i]=fcast3$mean[i]
}
## [1] 125.9071
## [1] 126.8751
## [1] 127.4958
## [1] 128.0442
## [1] 128.5745
## [1] 129.0974
## [1] 129.615
## [1] 130.1279
## [1] 130.6362
## [1] 131.1398
## [1] 131.6389
## [1] 132.1335
## [1] 132.6237
## [1] 133.1094
## [1] 133.5907
## [1] 134.0677
## [1] 134.5403
## [1] 135.0087
## [1] 135.4729
## [1] 135.9329
## [1] 136.3887
## [1] 136.8405
## [1] 137.2881
## [1] 137.7317
## [1] 138.1713
## [1] 138.6069
## [1] 139.0386
## [1] 139.4664
## [1] 139.8904
## [1] 140.3105
## [1] 140.7268
## [1] 141.1394
## [1] 141.5482
## [1] 141.9533
## [1] 142.3548
## [1] 142.7527
## [1] 143.147
## [1] 143.5377
## [1] 143.9249
## [1] 144.3086
## [1] 144.6888
## [1] 145.0656
## [1] 145.439
## [1] 145.809
## [1] 146.1757
## [1] 146.5391
## [1] 146.8992
## [1] 147.256
# Out-of-sample metrics for the Black-series xreg model.
cv_df1=data.frame("predict"=prediction2, "real"=real_value)
cor(cv_df1$real,cv_df1$predict)^2 # R^2 = 0.2034128
## [1] 0.2034128
RMSE1=sqrt(mean((cv_df1$real - cv_df1$predict)^2))
N_RMSE1=RMSE1/(max(cv_df1$real)-min(cv_df1$real))
round(RMSE1 , digits = 3)
## [1] 115.105
round(N_RMSE1, digits = 3)
## [1] 0.199
#CV
#best SARIMA model for the diff data:
# SARIMA(1,1,1)x(1,0,1)[12] on the full Asian series.
mod10A=Arima(asianHC$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,1),period=12), include.drift =FALSE)
#summary(mod10A) #AIC=2285.7 AICc=2285.87 BIC=2305.12
#compare forecasting performance of two models
# Holdout evaluation for the Asian series: train on months 1-312,
# forecast the last 48.
c=asianHC[1:312,] #including only through 2019
t=1:312
sarimaModA=Arima(c$hate_crime,order=c(1,1,1), seasonal=list(order=c(1,0,1),period=12), include.drift = FALSE)
#summary(sarimaModA) #AIC=2148.5 AICc=2148.67 BIC=2167.74
sarimaAPred=forecast(sarimaModA,h=48)$mean
#real data
real_value3=asianHC$hate_crime[c(313:360)]
#compare with real data
cor(real_value3,sarimaAPred)^2 # R^2 =[1] 0.03355379, might be due to surge in 2020?
## [1] 0.03355379
#cross-validation
# Regression-with-ARMA(2,1)-errors model for the Asian series; here the
# regressors are temperature, CPI, non-racial hate crimes, and a time
# trend (unemployment dropped).
a=new_merged$asianhc[1:312,] #including only through 2019
t=1:312
sarimaMod3=Arima(new_merged$asianhc[1:312,],order=c(2,0,1), xreg =cbind(new_merged$temp[1:312,], new_merged$cpi[1:312,], new_merged$non_rhc[1:312,], new_merged$time[1:312,]))
summary(sarimaMod3) # AICc=1916.26 BIC=1949.35 (per output below; earlier note said 2055.28/2089.09)
## Series: new_merged$asianhc[1:312, ]
## Regression with ARIMA(2,0,1) errors
##
## Coefficients:
## ar1 ar2 ma1 intercept new_merged$temp[1:312, ]
## 1.0259 -0.0828 -0.8119 33.0759 0.0306
## s.e. 0.1030 0.0757 0.0832 30.4239 0.0229
## new_merged$cpi[1:312, ] new_merged$non_rhc[1:312, ]
## -0.1300 0.0487
## s.e. 0.2322 0.0075
## new_merged$time[1:312, ]
## -0.0152
## s.e. 0.0841
##
## sigma^2 = 26.29: log likelihood = -948.83
## AIC=1915.66 AICc=1916.26 BIC=1949.35
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.07171987 5.061236 3.963485 -9.067421 26.34634 0.763247
## ACF1
## Training set -0.002219476
# Forecast with regressors held at training means; the column-name
# warning is benign as long as regressor order matches training.
fcast4 <- forecast(sarimaMod3, xreg=cbind(rep(mean(new_merged$temp[1:312,]), 48), rep(mean(new_merged$cpi[1:312,]), 48), rep(mean(new_merged$non_rhc[1:312,]),48), rep(mean(new_merged$time[1:312,]),48)))
## Warning in forecast.forecast_ARIMA(sarimaMod3, xreg =
## cbind(rep(mean(new_merged$temp[1:312, : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
autoplot(fcast4)
#real data
real_value=new_merged$asianhc[c(313:360)]
#compare with real data
# Collect the 48 forecast means for the Asian series.  Preallocate and
# fill in order instead of the original append()-then-rev() idiom;
# the printed output is unchanged.
prediction2=numeric(48)
for (i in 1:48){
print(fcast4$mean[i])
prediction2[i]=fcast4$mean[i]
}
## [1] 18.36
## [1] 18.11921
## [1] 18.0885
## [1] 18.07694
## [1] 18.06763
## [1] 18.05903
## [1] 18.05098
## [1] 18.04343
## [1] 18.03636
## [1] 18.02972
## [1] 18.0235
## [1] 18.01767
## [1] 18.0122
## [1] 18.00707
## [1] 18.00227
## [1] 17.99776
## [1] 17.99353
## [1] 17.98957
## [1] 17.98586
## [1] 17.98237
## [1] 17.97911
## [1] 17.97605
## [1] 17.97318
## [1] 17.97048
## [1] 17.96796
## [1] 17.96559
## [1] 17.96338
## [1] 17.9613
## [1] 17.95934
## [1] 17.95752
## [1] 17.9558
## [1] 17.95419
## [1] 17.95269
## [1] 17.95127
## [1] 17.94995
## [1] 17.94871
## [1] 17.94754
## [1] 17.94645
## [1] 17.94543
## [1] 17.94447
## [1] 17.94357
## [1] 17.94272
## [1] 17.94193
## [1] 17.94119
## [1] 17.94049
## [1] 17.93984
## [1] 17.93923
## [1] 17.93866
# Out-of-sample R^2 for the Asian-series xreg model.
cv_df1=data.frame("predict"=prediction2, "real"=real_value)
cor(cv_df1$real,cv_df1$predict)^2 # R^2 = 0.1798148
## [1] 0.1798148
#for asian hate crime, try CV using the arfima model
# Refit ARFIMA(0,d,0) on the training window only (months 1-312) and
# forecast the 48-month holdout for comparison with the xreg model.
t=1:312
arfimaMod_ahc=arfima(asianHC$hate_crime[1:312], order = c(0,0,0))
## Note: only one starting point. Only one mode can be found -- this is now the default behavior.
## Beginning the fits with 1 starting values.
summary(arfimaMod_ahc) #AIC=1081.91 BIC=1093.14 (per output below; earlier note 3718.89/3749.28 was copied from sarimaMod2)
##
## Call:
##
## arfima(z = asianHC$hate_crime[1:312], order = c(0, 0, 0))
##
##
## Mode 1 Coefficients:
## Estimate Std. Error Th. Std. Err. z-value Pr(>|z|)
## d.f 0.3785830 0.0350286 0.0441362 10.80784 < 2.22e-16 ***
## Fitted mean 17.3546586 3.5608988 NA 4.87367 1.0954e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## sigma^2 estimated as 31.5065; Log-likelihood = -537.957; AIC = 1081.91; BIC = 1093.14
##
## Numerical Correlations of Coefficients:
## d.f Fitted mean
## d.f 1.00 -0.03
## Fitted mean -0.03 1.00
##
## Theoretical Correlations of Coefficients:
## d.f
## d.f 1.00
##
## Expected Fisher Information Matrix of Coefficients:
## d.f
## d.f 1.65
fcast2 <- predict(arfimaMod_ahc, n.ahead=48)
fcast2
## $`Mode 1`
## $`Mode 1`$`Forecasts and SDs`
## 1 2 3 4 5 6 7
## Forecasts 12.18301 12.25992 12.36044 12.46772 12.57443 12.67774 12.77663
## Exact SD 5.61435 6.00413 6.18113 6.29011 6.36689 6.42528 6.47193
## Limiting SD 5.61306 6.00185 6.17800 6.28621 6.36228 6.42001 6.46602
## 8 9 10 11 12 13 14
## Forecasts 12.87084 12.96046 13.04571 13.12687 13.20421 13.27802 13.34856
## Exact SD 6.51050 6.54320 6.57147 6.59629 6.61835 6.63817 6.65613
## Limiting SD 6.50399 6.53611 6.56382 6.58809 6.60963 6.62893 6.64639
## 15 16 17 18 19 20 21
## Forecasts 13.41606 13.48074 13.54280 13.60242 13.65977 13.71499 13.76822
## Exact SD 6.67251 6.68757 6.70147 6.71438 6.72642 6.73768 6.74825
## Limiting SD 6.66229 6.67686 6.69030 6.70275 6.71433 6.72515 6.73529
## 22 23 24 25 26 27 28
## Forecasts 13.81959 13.86921 13.91718 13.96359 14.00853 14.05209 14.09433
## Exact SD 6.75822 6.76763 6.77655 6.78501 6.79307 6.80075 6.80808
## Limiting SD 6.74483 6.75382 6.76232 6.77038 6.77804 6.78532 6.79227
## 29 30 31 32 33 34 35
## Forecasts 14.1353 14.17515 14.21384 14.25147 14.28807 14.32370 14.35841
## Exact SD 6.8151 6.82183 6.82828 6.83449 6.84046 6.84621 6.85176
## Limiting SD 6.7989 6.80525 6.81133 6.81716 6.82277 6.82816 6.83335
## 36 37 38 39 40 41 42
## Forecasts 14.39223 14.42520 14.45736 14.48874 14.51937 14.54929 14.57852
## Exact SD 6.85712 6.86229 6.86731 6.87216 6.87686 6.88143 6.88586
## Limiting SD 6.83836 6.84319 6.84786 6.85237 6.85674 6.86097 6.86507
## 43 44 45 46 47 48
## Forecasts 14.60709 14.63502 14.66234 14.68907 14.71522 14.7408
## Exact SD 6.89016 6.89434 6.89841 6.90237 6.90623 6.9100
## Limiting SD 6.86905 6.87291 6.87666 6.88030 6.88385 6.8873
# Forecast means transcribed by hand from the predict() output above.
# NOTE(review): keep in sync with fcast2; extracting the values
# programmatically from the predict object would be more robust.
predictedDF=data.frame("fit"= c(12.18301, 12.25992, 12.36044, 12.46772, 12.57443, 12.67774 ,12.77663, 12.87084, 12.96046, 13.04571, 13.12687, 13.20421, 13.27802, 13.34856, 13.41606, 13.48074, 13.54280, 13.60242, 13.65977, 13.71499, 13.76822, 13.81959, 13.86921, 13.91718, 13.96359, 14.00853, 14.05209, 14.09433, 14.1353, 14.17515, 14.21384, 14.25147,14.28807, 14.32370, 14.35841 ,14.39223, 14.42520 ,14.45736 ,14.48874, 14.51937 ,14.54929, 14.57852, 14.60709 ,14.63502 ,14.66234, 14.68907 ,14.71522, 14.7408))
#real data
real_value=asianHC$hate_crime[c(313:360)]
#compare with real data
# ARFIMA holdout R^2 = 0.30: higher than every SARIMA/xreg variant
# tried above for the Asian series.
cv_df1=data.frame("predict"=predictedDF$fit, "real"=real_value)
cor(cv_df1$real,cv_df1$predict)^2 # R^2 = 0.302834
## [1] 0.302834